libhext: C++ Library Documentation 1.0.13-b24695d
Loading...
Searching...
No Matches
Rule.h
Go to the documentation of this file.
1// Copyright 2015-2021 Thomas Trapp
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#ifndef HEXT_RULE_H_INCLUDED
16#define HEXT_RULE_H_INCLUDED
17
18/// @file
19/// Declares hext::Rule
20
21#include "hext/Html.h"
22#include "hext/HtmlTag.h"
23#include "hext/Result.h"
24#include "hext/Match.h"
25#include "hext/Capture.h"
26#include "hext/Visibility.h"
27
28#include <cstddef>
29#include <cstdint>
30#include <memory>
31#include <optional>
32#include <string>
33#include <vector>
34
35#include <gumbo.h>
36
37
38namespace hext {
39
40
41/// Extracts values from HTML.
42///
43/// A Rule defines how to match and capture HTML nodes. It can be applied to a
44/// GumboNode tree, where it recursively tries to find matches.
45///
46/// @par Example:
47/// ~~~~~~~~~~~~~
48/// // create a rule that matches anchor elements, ..
49/// Rule anchor(HtmlTag::A);
50/// // .. which must have an attribute called "href"
51/// anchor.append_match<AttributeMatch>("href")
52/// // capture attribute href and save it as "link"
53/// .append_capture<AttributeCapture>("href", "link");
54///
55/// {
56/// // create a rule that matches image elements
57/// Rule img(HtmlTag::IMG);
58/// // capture attribute src and save it as "img"
59/// img.append_capture<AttributeCapture>("src", "img");
60/// // append the image-rule to the anchor-rule
61/// anchor.append_child(std::move(img));
62/// }
63///
64/// // anchor is now equivalent to the following hext:
65/// // <a href:link><img src:img/></a>
66///
67/// Html html(
68/// "<div><a href='/bob'> <img src='bob.jpg'/> </a></div>"
69/// "<div><a href='/alice'><img src='alice.jpg'/></a></div>"
70/// "<div><a href='/carol'><img src='carol.jpg'/></a></div>");
71///
72/// hext::Result result = anchor.extract(html);
73/// // result will be equivalent to this:
74/// // vector{
75/// // map{
76/// // {"link", "/bob"}
77/// // {"img", "bob.jpg"}
78/// // },
79/// // map{
80/// // {"link", "/alice"}
81/// // {"img", "alice.jpg"}
82/// // },
83/// // map{
84/// // {"link", "/carol"}
85/// // {"img", "carol.jpg"}
86/// // },
87/// // }
88/// ~~~~~~~~~~~~~
90{
91public:
92 /// Constructs a Rule with a known HTML tag.
93 ///
94 /// @param tag: The HtmlTag that this rule matches.
95 /// Default: Match any tag.
96 /// @param optional: A subtree matches only if all mandatory rules were
97 /// matched. Optional rules on the other hand are ignored
98 /// if not found.
99 /// Default: Rule is mandatory.
100 /// @param greedy: Whether this rule should be repeated once a match is
101 /// found.
102 /// Default: Rule is matched once.
103 explicit Rule(HtmlTag tag = HtmlTag::ANY,
104 bool optional = false,
105 bool greedy = false) noexcept;
106
107 /// Constructs a Rule with the HTML tag given as a string.
108 ///
109 /// @param tag: The HTML tagname that this rule matches.
110 /// Custom/unknown HTML tags are allowed.
111 /// If the tagname is a standard-HTML tag, it is converted
112 /// to an HtmlTag.
113 /// @param optional: A subtree matches only if all mandatory rules were
114 /// matched. Optional rules on the other hand are ignored
115 /// if not found.
116 /// Default: Rule is mandatory.
117 /// @param greedy: Whether this rule should be repeated once a match is
118 /// found.
119 /// Default: Rule is matched once.
120 explicit Rule(std::string tag,
121 bool optional = false,
122 bool greedy = false) noexcept;
123
124 ~Rule() noexcept = default;
125 Rule(Rule&&) noexcept = default;
126 Rule(const Rule& other);
127 Rule& operator=(Rule&&) noexcept = default;
128 Rule& operator=(const Rule& other);
129
130 /// Returns the child or nullptr if childless.
131 const Rule * child() const noexcept;
132
133 /// Returns the next rule or nullptr if no following rule.
134 const Rule * next() const noexcept;
135
136 /// Returns the nested rules.
137 const std::vector<Rule>& nested() const noexcept;
138
139 /// Returns the child or nullptr if childless.
140 Rule * child() noexcept;
141
142 /// Returns the next rule or nullptr if no following rule.
143 Rule * next() noexcept;
144
145 /// Returns the nested rules.
146 std::vector<Rule>& nested() noexcept;
147
148 /// Appends a child.
149 ///
150 /// @param new_child: The Rule to append.
151 /// @returns A reference for this Rule to enable method chaining.
152 Rule& append_child(Rule new_child);
153
154 /// Appends a following Rule.
155 ///
156 /// @param sibling: The Rule to append.
157 /// @returns A reference for this Rule to enable method chaining.
158 Rule& append_next(Rule sibling);
159
160 /// Appends a nested Rule.
161 ///
162 /// @param nested: The Rule to append.
163 /// @returns A reference for this Rule to enable method chaining.
164 Rule& append_nested(Rule nested);
165
166 /// Appends a Match.
167 ///
168 /// @param match: The Match to append.
169 /// @returns A reference for this Rule to enable method chaining.
170 Rule& append_match(std::unique_ptr<Match> match);
171
172 /// Emplaces a Match: constructs a MatchType and appends it to this Rule.
173 /// Forwards all arguments to std::make_unique<MatchType>.
174 /// @tparam MatchType: The type to construct; the resulting pointer is passed to append_match(std::unique_ptr<Match>).
175 /// @returns A reference for this Rule to enable method chaining.
176 template<typename MatchType, typename... Args>
177 Rule& append_match(Args&&... arg)
178 {
179 return this->append_match(
180 std::make_unique<MatchType>(std::forward<Args>(arg)...));
181 }
182
183 /// Appends a Capture.
184 ///
185 /// @param cap: The Capture to append.
186 /// @returns A reference for this Rule to enable method chaining.
187 Rule& append_capture(std::unique_ptr<Capture> cap);
188
189 /// Emplaces a Capture: constructs a CaptureType and appends it to this Rule.
190 /// Forwards all arguments to std::make_unique<CaptureType>.
191 /// @tparam CaptureType: The type to construct; the resulting pointer is passed to append_capture(std::unique_ptr<Capture>).
192 /// @returns A reference for this Rule to enable method chaining.
193 template<typename CaptureType, typename... Args>
194 Rule& append_capture(Args&&... arg)
195 {
196 return this->append_capture(
197 std::make_unique<CaptureType>(std::forward<Args>(arg)...));
198 }
199
200 /// Returns the HtmlTag this rule matches.
201 HtmlTag get_tag() const noexcept;
202
203 /// Sets the HtmlTag this rule matches.
204 ///
205 /// @returns A reference for this Rule to enable method chaining.
206 Rule& set_tag(HtmlTag tag) noexcept;
207
208 /// Returns true if this rule is optional, i.e. if a match does not have to be found.
209 bool is_optional() const noexcept;
210
211 /// Sets whether this rule is optional, i.e. whether a match does not have to be found.
212 ///
213 /// @returns A reference for this Rule to enable method chaining.
214 Rule& set_optional(bool optional) noexcept;
215
216 /// Returns true if this rule is to be matched repeatedly.
217 bool is_greedy() const noexcept;
218
219 /// Sets whether this rule is to be matched repeatedly.
220 ///
221 /// @returns A reference for this Rule to enable method chaining.
222 Rule& set_greedy(bool greedy) noexcept;
223
224 /// Returns the custom HTML tag name.
225 ///
226 /// @returns Empty optional if no custom HTML tag name.
227 std::optional<std::string> get_tagname() const;
228
229 /// Sets the custom HTML tag name.
230 ///
231 /// @note The HTML tag name is only matched if this Rule's HtmlTag equals
232 /// HtmlTag::UNKNOWN.
233 /// @returns A reference for this Rule to enable method chaining.
234 Rule& set_tagname(const std::string& tagname);
235
236 /// Recursively extracts values from a hext::Html.
237 ///
238 /// @param max_searches: Abort extraction by throwing a `MaxSearchError`
239 /// after doing this number of searches in the
240 /// given Html.
241 /// @returns A vector containing maps filled with the captured
242 /// name-value pairs.
243 hext::Result extract(const Html& html,
244 std::uint64_t max_searches = 0) const;
245
246 /// Recursively extracts values from a GumboNode.
247 ///
248 /// @param max_searches: Abort extraction by throwing a `MaxSearchError`
249 /// after doing this number of searches in the
250 /// given GumboNode.
251 /// @returns A vector containing maps filled with the captured
252 /// name-value pairs.
253 hext::Result extract(const GumboNode * node,
254 std::uint64_t max_searches = 0) const;
255
256 /// Returns true if this Rule matches node.
257 ///
258 /// @param node: A GumboNode that is to be matched.
259 bool matches(const GumboNode * node) const;
260
261 /// Returns the result of applying every Capture to node.
262 ///
263 /// @param node: A GumboNode that is to be captured.
264 std::vector<ResultPair> capture(const GumboNode * node) const;
265
266private:
267 HEXT_PRIVATE void swap(hext::Rule& other) noexcept;
268
269 std::unique_ptr<Rule> first_child_;
270 std::unique_ptr<Rule> next_;
271 std::vector<Rule> nested_;
272 std::vector<std::unique_ptr<Match>> matches_;
273 std::vector<std::unique_ptr<Capture>> captures_;
274
275 HtmlTag tag_;
276 bool is_optional_;
277 bool is_greedy_;
278 std::optional<std::string> tagname_;
279};
280
281
282} // namespace hext
283
284
285#endif // HEXT_RULE_H_INCLUDED
286
Declares hext::Capture.
All valid HTML tags.
Declares hext::Html.
Declares hext::Match.
Typedefs for results returned from capturing HTML.
Defines HEXT_PUBLIC and HEXT_PRIVATE.
#define HEXT_PRIVATE
Definition Visibility.h:27
#define HEXT_PUBLIC
Definition Visibility.h:26
Abstract base for every Capture.
Definition Capture.h:45
A RAII wrapper for Gumbo.
Definition Html.h:47
Abstract base for every Match.
Definition Match.h:42
Extracts values from HTML.
Definition Rule.h:90
Rule & append_capture(std::unique_ptr< Capture > cap)
Appends a Capture.
Rule(HtmlTag tag=HtmlTag::ANY, bool optional=false, bool greedy=false) noexcept
Constructs a Rule with a known HTML tag.
HtmlTag get_tag() const noexcept
Returns the HtmlTag this rule matches.
Rule & append_capture(Args &&... arg)
Emplaces a Capture.
Definition Rule.h:194
HtmlTag
An enum containing all valid HTML tags.
Definition HtmlTag.h:31
std::pair< std::string, std::string > ResultPair
A string-pair containing a name and a value.
Definition Result.h:32
std::vector< ResultMap > Result
A vector containing ResultMap.
Definition Result.h:45