libhext: C++ Library Documentation  1.0.12-3ea013c
Rule.h
Go to the documentation of this file.
1 // Copyright 2015-2021 Thomas Trapp
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef HEXT_RULE_H_INCLUDED
16 #define HEXT_RULE_H_INCLUDED
17 
18 /// @file
19 /// Declares hext::Rule
20 
21 #include "hext/Html.h"
22 #include "hext/HtmlTag.h"
23 #include "hext/Result.h"
24 #include "hext/Match.h"
25 #include "hext/Capture.h"
26 #include "hext/Visibility.h"
27 
28 #include <cstddef>
29 #include <cstdint>
30 #include <memory>
31 #include <optional>
32 #include <string>
33 #include <vector>
34 
35 #include <gumbo.h>
36 
37 
38 namespace hext {
39 
40 
41 /// Extracts values from HTML.
42 ///
43 /// A Rule defines how to match and capture HTML nodes. It can be applied to a
44 /// GumboNode tree, where it recursively tries to find matches.
45 ///
46 /// @par Example:
47 /// ~~~~~~~~~~~~~
48 /// // create a rule that matches anchor elements, ..
49 /// Rule anchor(HtmlTag::A);
50 /// // .. which must have an attribute called "href"
51 /// anchor.append_match<AttributeMatch>("href")
52 /// // capture attribute href and save it as "link"
53 /// .append_capture<AttributeCapture>("href", "link");
54 ///
55 /// {
56 /// // create a rule that matches image elements
57 /// Rule img(HtmlTag::IMG);
58 /// // capture attribute src and save it as "img"
59 /// img.append_capture<AttributeCapture>("src", "img");
60 /// // append the image-rule to the anchor-rule
61 /// anchor.append_child(std::move(img));
62 /// }
63 ///
64 /// // anchor is now equivalent to the following hext:
65 /// // <a href:link><img src:img/></a>
66 ///
67 /// Html html(
68 /// "<div><a href='/bob'> <img src='bob.jpg'/> </a></div>"
69 /// "<div><a href='/alice'><img src='alice.jpg'/></a></div>"
70 /// "<div><a href='/carol'><img src='carol.jpg'/></a></div>");
71 ///
72 /// hext::Result result = anchor.extract(html);
73 /// // result will be equivalent to this:
74 /// // vector{
75 /// // map{
76 /// // {"link", "/bob"}
77 /// // {"img", "bob.jpg"}
78 /// // },
79 /// // map{
80 /// // {"link", "/alice"}
81 /// // {"img", "alice.jpg"}
82 /// // },
83 /// // map{
84 /// // {"link", "/carol"}
85 /// // {"img", "carol.jpg"}
86 /// // },
87 /// // }
88 /// ~~~~~~~~~~~~~
89 class HEXT_PUBLIC Rule
90 {
91 public:
92  /// Constructs a Rule with a known HTML tag.
93  ///
94  /// @param tag: The HtmlTag that this rule matches.
95  /// Default: Match any tag.
96  /// @param optional: A subtree matches only if all mandatory rules were
97  /// matched. Optional rules on the other hand are ignored
98  /// if not found.
99  /// Default: Rule is mandatory.
100  /// @param greedy: Whether this rule should be repeated once a match is
101  /// found.
102  /// Default: Rule is matched once.
103  explicit Rule(HtmlTag tag = HtmlTag::ANY,
104  bool optional = false,
105  bool greedy = false) noexcept;
106 
107  /// Constructs a Rule with the HTML tag given as a string.
108  ///
109  /// @param tag: The HTML tagname that this rule matches.
110  /// Custom/unknown HTML tags are allowed.
111  /// If the tagname is a standard-HTML tag, it is converted
112  /// to an HtmlTag.
113  /// @param optional: A subtree matches only if all mandatory rules were
114  /// matched. Optional rules on the other hand are ignored
115  /// if not found.
116  /// Default: Rule is mandatory.
117  /// @param greedy: Whether this rule should be repeated once a match is
118  /// found.
119  /// Default: Rule is matched once.
120  explicit Rule(std::string tag,
121  bool optional = false,
122  bool greedy = false) noexcept;
123 
124  ~Rule() noexcept = default;
125  Rule(Rule&&) noexcept = default;
126  Rule(const Rule& other);
127  Rule& operator=(Rule&&) noexcept = default;
128  Rule& operator=(const Rule& other);
129 
130  /// Returns the child or nullptr if childless.
131  const Rule * child() const noexcept;
132 
133  /// Returns the next rule or nullptr if no following rule.
134  const Rule * next() const noexcept;
135 
136  /// Returns the nested rules.
137  const std::vector<Rule>& nested() const noexcept;
138 
139  /// Returns the child or nullptr if childless.
140  Rule * child() noexcept;
141 
142  /// Returns the next rule or nullptr if no following rule.
143  Rule * next() noexcept;
144 
145  /// Returns the nested rules.
146  std::vector<Rule>& nested() noexcept;
147 
148  /// Appends a child.
149  ///
150  /// @param new_child: The Rule to append.
151  /// @returns A reference for this Rule to enable method chaining.
152  Rule& append_child(Rule new_child);
153 
154  /// Appends a following Rule.
155  ///
156  /// @param sibling: The Rule to append.
157  /// @returns A reference for this Rule to enable method chaining.
158  Rule& append_next(Rule sibling);
159 
160  /// Appends a nested Rule.
161  ///
162  /// @param nested: The Rule to append.
163  /// @returns A reference for this Rule to enable method chaining.
164  Rule& append_nested(Rule nested);
165 
166  /// Appends a Match.
167  ///
168  /// @param match: The Match to append.
169  /// @returns A reference for this Rule to enable method chaining.
170  Rule& append_match(std::unique_ptr<Match> match);
171 
172  /// Constructs a Match of type MatchType and appends it to this Rule.
173  /// All arguments are forwarded unchanged to std::make_unique<MatchType>.
174  ///
175  /// @returns A reference for this Rule to enable method chaining.
176  template<typename MatchType, typename... Args>
177  Rule& append_match(Args&&... arg)
178  {
179  auto new_match = std::make_unique<MatchType>(std::forward<Args>(arg)...);
180  return this->append_match(std::move(new_match));
181  }
182 
183  /// Appends a Capture.
184  ///
185  /// @param cap: The Capture to append.
186  /// @returns A reference for this Rule to enable method chaining.
187  Rule& append_capture(std::unique_ptr<Capture> cap);
188 
189  /// Constructs a Capture of type CaptureType and appends it to this Rule.
190  /// All arguments are forwarded unchanged to std::make_unique<CaptureType>.
191  ///
192  /// @returns A reference for this Rule to enable method chaining.
193  template<typename CaptureType, typename... Args>
194  Rule& append_capture(Args&&... arg)
195  {
196  auto new_capture = std::make_unique<CaptureType>(std::forward<Args>(arg)...);
197  return this->append_capture(std::move(new_capture));
198  }
199 
200  /// Returns the HtmlTag this rule matches.
201  HtmlTag get_tag() const noexcept;
202 
203  /// Sets the HtmlTag this rule matches.
204  ///
205  /// @returns A reference for this Rule to enable method chaining.
206  Rule& set_tag(HtmlTag tag) noexcept;
207 
208  /// Returns true if this rule is optional, i.e. if a match does not have to be found.
209  bool is_optional() const noexcept;
210 
211  /// Sets whether this rule is optional, i.e. whether a match does not have to be found.
212  ///
213  /// @returns A reference for this Rule to enable method chaining.
214  Rule& set_optional(bool optional) noexcept;
215 
216  /// Returns true if this rule is to be matched repeatedly.
217  bool is_greedy() const noexcept;
218 
219  /// Sets whether this rule is to be matched repeatedly.
220  ///
221  /// @returns A reference for this Rule to enable method chaining.
222  Rule& set_greedy(bool greedy) noexcept;
223 
224  /// Get custom HTML tag name.
225  ///
226  /// @returns Empty optional if no custom HTML tag name.
227  std::optional<std::string> get_tagname() const;
228 
229  /// Set custom HTML tag name.
230  ///
231  /// @note The HTML tag name is only matched if this Rule's HtmlTag equals
232  /// HtmlTag::UNKNOWN.
233  /// @returns A reference for this Rule to enable method chaining.
234  Rule& set_tagname(const std::string& tagname);
235 
236  /// Recursively extracts values from a hext::Html.
237  ///
238  /// @param max_searches: Abort extraction by throwing a `MaxSearchError`
239  /// after doing this amount of searches in the
240  /// given Html.
241  /// @returns A vector containing maps filled with the captured
242  /// name value pairs.
243  hext::Result extract(const Html& html,
244  std::uint64_t max_searches = 0) const;
245 
246  /// Recursively extracts values from a GumboNode.
247  ///
248  /// @param max_searches: Abort extraction by throwing a `MaxSearchError`
249  /// after doing this amount of searches in the
250  /// given GumboNode.
251  /// @returns A vector containing maps filled with the captured
252  /// name value pairs.
253  hext::Result extract(const GumboNode * node,
254  std::uint64_t max_searches = 0) const;
255 
256  /// Returns true if this Rule matches node.
257  ///
258  /// @param node: A GumboNode that is to be matched.
259  bool matches(const GumboNode * node) const;
260 
261  /// Returns the result of applying every Capture to node.
262  ///
263  /// @param node: A GumboNode that is to be captured.
264  std::vector<ResultPair> capture(const GumboNode * node) const;
265 
266 private:
267  HEXT_PRIVATE void swap(hext::Rule& other) noexcept;
268 
269  std::unique_ptr<Rule> first_child_;
270  std::unique_ptr<Rule> next_;
271  std::vector<Rule> nested_;
272  std::vector<std::unique_ptr<Match>> matches_;
273  std::vector<std::unique_ptr<Capture>> captures_;
274 
275  HtmlTag tag_;
276  bool is_optional_;
277  bool is_greedy_;
278  std::optional<std::string> tagname_;
279 };
280 
281 
282 } // namespace hext
283 
284 
285 #endif // HEXT_RULE_H_INCLUDED
286 
Declares hext::Capture.
All valid HTML tags.
Declares hext::Html.
Declares hext::Match.
Typedefs for results returned from capturing HTML.
Defines HEXT_PUBLIC and HEXT_PRIVATE.
#define HEXT_PRIVATE
Definition: Visibility.h:27
#define HEXT_PUBLIC
Definition: Visibility.h:26
Abstract base for every Capture.
Definition: Capture.h:45
A RAII wrapper for Gumbo.
Definition: Html.h:47
Abstract base for every Match.
Definition: Match.h:42
Extracts values from HTML.
Definition: Rule.h:90
Rule & append_capture(Args &&... arg)
Emplaces a Capture.
Definition: Rule.h:194
Rule(HtmlTag tag=HtmlTag::ANY, bool optional=false, bool greedy=false) noexcept
Constructs a Rule with a known HTML tag.
HtmlTag get_tag() const noexcept
Returns the HtmlTag this rule matches.
Rule & append_capture(std::unique_ptr< Capture > cap)
Appends a Capture.
HtmlTag
An enum containing all valid HTML tags.
Definition: HtmlTag.h:31
@ ANY
Any html tag.
std::pair< std::string, std::string > ResultPair
A string-pair containing a name and a value.
Definition: Result.h:32
std::vector< ResultMap > Result
A vector containing ResultMap.
Definition: Result.h:45