All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Pages
Rule.h
Go to the documentation of this file.
1 // Copyright 2015, 2016 Thomas Trapp
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef HEXT_RULE_H_INCLUDED
16 #define HEXT_RULE_H_INCLUDED
17 
18 /// @file
19 /// Declares hext::Rule
20 
21 #include "hext/Html.h"
22 #include "hext/HtmlTag.h"
23 #include "hext/Result.h"
24 #include "hext/Match.h"
25 #include "hext/Capture.h"
26 
27 #include <cstddef>
28 #include <memory>
29 #include <vector>
30 
31 #include <gumbo.h>
32 
33 
34 namespace hext {
35 
36 
37 /// Extracts values from HTML.
38 ///
39 /// A Rule defines how to match and capture HTML nodes. It can be applied to a
40 /// GumboNode tree, where it recursively tries to find matches.
41 ///
42 /// @par Example:
43 /// ~~~~~~~~~~~~~
44 /// // create a rule that matches anchor elements, ..
45 /// Rule anchor(HtmlTag::A);
46 /// // .. which must have an attribute called "href"
47 /// anchor.append_match<AttributeMatch>("href")
48 /// // capture attribute href and save it as "link"
49 /// .append_capture<AttributeCapture>("href", "link");
50 ///
51 /// {
52 /// // create a rule that matches image elements
53 /// Rule img(HtmlTag::IMG);
54 /// // capture attribute src and save it as "img"
55 /// img.append_capture<AttributeCapture>("src", "img");
56 /// // append the image-rule to the anchor-rule
57 /// anchor.append_child(std::move(img));
58 /// }
59 ///
60 /// // anchor is now equivalent to the following hext:
61 /// // <a href:link><img src:img/></a>
62 ///
63 /// Html html(
64 /// "<div><a href='/bob'> <img src='bob.jpg'/> </a></div>"
65 /// "<div><a href='/alice'><img src='alice.jpg'/></a></div>"
66 /// "<div><a href='/carol'><img src='carol.jpg'/></a></div>");
67 ///
68 /// hext::Result result = anchor.extract(html);
69 /// // result will be equivalent to this:
70 /// // vector{
71 /// // map{
72 /// // {"link", "/bob"}
73 /// // {"img", "bob.jpg"}
74 /// // },
75 /// // map{
76 /// // {"link", "/alice"}
77 /// // {"img", "alice.jpg"}
78 /// // },
79 /// // map{
80 /// // {"link", "/carol"}
81 /// // {"img", "carol.jpg"}
82 /// // },
83 /// // }
84 /// ~~~~~~~~~~~~~
85 class Rule
86 {
87 public:
88  /// Constructs a Rule.
89  ///
90  /// @param tag: The HtmlTag that this rule matches.
91  /// Default: Match any tag.
92  /// @param optional: A subtree matches only if all mandatory rules were
93  /// matched. Optional rules on the other hand are ignored
94  /// if not found.
95  /// Default: Rule is mandatory.
96  explicit Rule(HtmlTag tag = HtmlTag::ANY,
97  bool optional = false) noexcept;
98 
  // Children, siblings, matches and captures are held through std::unique_ptr
  // (see the private members), so the copy operations cannot be defaulted.
  // They are user-declared here and defined outside of this header.
99  ~Rule() noexcept = default;
100  Rule(Rule&&) noexcept = default;
101  Rule(const Rule& other);
102  Rule& operator=(Rule&&) noexcept = default;
103  Rule& operator=(const Rule& other);
104 
105  /// Returns the first child or nullptr if childless.
106  const Rule * child() const noexcept;
107 
108  /// Returns the next rule or nullptr if no following rule.
109  const Rule * next() const noexcept;
110 
111  /// Returns the first child or nullptr if childless.
112  Rule * child() noexcept;
113 
114  /// Returns the next rule or nullptr if no following rule.
115  Rule * next() noexcept;
116 
117  /// Appends a child.
118  ///
119  /// @param new_child: The Rule to append. Taken by value; pass an rvalue
119  /// (e.g. via std::move) to avoid copying it.
120  /// @returns A reference for this Rule to enable method chaining.
121  Rule& append_child(Rule new_child);
122 
123  /// Appends a following Rule.
124  ///
125  /// @param sibling: The Rule to append. Taken by value; pass an rvalue
125  /// (e.g. via std::move) to avoid copying it.
126  /// @returns A reference for this Rule to enable method chaining.
127  Rule& append_next(Rule sibling);
128 
129  /// Appends a Match.
130  ///
131  /// @param match: The Match to append. Ownership is transferred to this Rule.
132  /// @returns A reference for this Rule to enable method chaining.
133  Rule& append_match(std::unique_ptr<Match> match);
134 
135  /// Emplaces a Match.
136  /// Forwards arguments to std::make_unique.
137  ///
  /// @tparam MatchType: The concrete type to construct. The resulting
  /// std::unique_ptr<MatchType> must be convertible to
  /// std::unique_ptr<Match>.
  /// @param arg: Constructor arguments for MatchType, perfectly forwarded.
138  /// @returns A reference for this Rule to enable method chaining.
139  template<typename MatchType, typename... Args>
140  Rule& append_match(Args&&... arg)
141  {
  // Construct the MatchType and hand it to the unique_ptr overload above.
142  return this->append_match(
143  std::make_unique<MatchType>(std::forward<Args>(arg)...));
144  }
145 
146  /// Appends a Capture.
147  ///
148  /// @param cap: The Capture to append. Ownership is transferred to this Rule.
149  /// @returns A reference for this Rule to enable method chaining.
150  Rule& append_capture(std::unique_ptr<Capture> cap);
151 
152  /// Emplaces a Capture.
153  /// Forwards arguments to std::make_unique.
154  ///
  /// @tparam CaptureType: The concrete type to construct. The resulting
  /// std::unique_ptr<CaptureType> must be convertible to
  /// std::unique_ptr<Capture>.
  /// @param arg: Constructor arguments for CaptureType, perfectly forwarded.
155  /// @returns A reference for this Rule to enable method chaining.
156  template<typename CaptureType, typename... Args>
157  Rule& append_capture(Args&&... arg)
158  {
  // Construct the CaptureType and hand it to the unique_ptr overload above.
159  return this->append_capture(
160  std::make_unique<CaptureType>(std::forward<Args>(arg)...));
161  }
162 
163  /// Returns the HtmlTag this rule matches.
164  HtmlTag get_tag() const noexcept;
165 
166  /// Sets the HtmlTag this rule matches.
167  ///
  /// @param tag: The HtmlTag this rule shall match.
168  /// @returns A reference for this Rule to enable method chaining.
169  Rule& set_tag(HtmlTag tag) noexcept;
170 
171  /// Returns true if this rule is optional, i.e. if no match has to be found.
172  bool is_optional() const noexcept;
173 
174  /// Sets whether this rule is optional, i.e. whether a match may be absent.
175  ///
  /// @param optional: True to make this rule optional, false to make it
  /// mandatory.
176  /// @returns A reference for this Rule to enable method chaining.
177  Rule& set_optional(bool optional) noexcept;
178 
179  /// Recursively extracts values from a hext::Html.
180  ///
  /// @param html: The Html to extract values from.
181  /// @returns A vector containing maps filled with the captured
182  /// name value pairs.
183  hext::Result extract(const Html& html) const;
184 
185  /// Recursively extracts values from a GumboNode.
186  ///
  /// @param node: The GumboNode to extract values from.
187  /// @returns A vector containing maps filled with the captured
188  /// name value pairs.
189  hext::Result extract(const GumboNode * node) const;
190 
191  /// Returns true if this Rule matches node.
192  ///
193  /// @param node: A GumboNode that is to be matched.
194  bool matches(const GumboNode * node) const;
195 
196  /// Returns the result of applying every Capture to node.
197  ///
198  /// @param node: A GumboNode that is to be captured.
199  std::vector<ResultPair> capture(const GumboNode * node) const;
200 
201 private:
  // NOTE(review): presumably a helper for the user-declared copy assignment;
  // the definition is not visible in this header -- confirm in Rule.cpp.
202  void swap(hext::Rule& other) noexcept;
203 
  // Owned links to the first child and the following rule; judging by the
  // names these back child() and next() (definitions not shown here).
204  std::unique_ptr<Rule> first_child_;
205  std::unique_ptr<Rule> next_;
  // Owned Match and Capture objects, filled through append_match() and
  // append_capture() respectively (presumably; definitions not shown here).
206  std::vector<std::unique_ptr<Match>> matches_;
207  std::vector<std::unique_ptr<Capture>> captures_;
208 
209  HtmlTag tag_;
210  bool is_optional_;
211 };
212 
213 
214 } // namespace hext
215 
216 
217 #endif // HEXT_RULE_H_INCLUDED
218 
HtmlTag
An enum containing all valid HTML tags.
Definition: HtmlTag.h:28
Rule & append_child(Rule new_child)
Appends a child.
hext::Result extract(const Html &html) const
Recursively extracts values from a hext::Html.
std::vector< ResultMap > Result
A vector containing ResultMap.
Definition: Result.h:45
std::vector< ResultPair > capture(const GumboNode *node) const
Returns the result of applying every Capture to node.
Abstract base for every Match.
Definition: Match.h:39
All valid HTML tags.
Declares hext::Html.
Rule & append_match(std::unique_ptr< Match > match)
Appends a Match.
Rule & set_optional(bool optional) noexcept
Sets whether this rule is optional, i.e. whether a match may be absent.
Rule & append_match(Args &&...arg)
Emplaces a Match.
Definition: Rule.h:140
Rule & append_capture(Args &&...arg)
Emplaces a Capture.
Definition: Rule.h:157
Extracts values from HTML.
Definition: Rule.h:85
Typedefs for results returned from capturing HTML.
Abstract base for every Capture.
Definition: Capture.h:44
Rule & operator=(Rule &&) noexcept=default
Rule & append_next(Rule sibling)
Appends a following Rule.
bool matches(const GumboNode *node) const
Returns true if this Rule matches node.
const Rule * next() const noexcept
Returns the next rule or nullptr if no following rule.
Declares hext::Match.
const Rule * child() const noexcept
Returns the first child or nullptr if childless.
Rule(HtmlTag tag=HtmlTag::ANY, bool optional=false) noexcept
Constructs a Rule.
~Rule() noexcept=default
bool is_optional() const noexcept
Returns true if this rule is optional, i.e. if no match has to be found.
Rule & set_tag(HtmlTag tag) noexcept
Sets the HtmlTag this rule matches.
HtmlTag get_tag() const noexcept
Returns the HtmlTag this rule matches.
Any html tag.
Rule & append_capture(std::unique_ptr< Capture > cap)
Appends a Capture.
std::pair< std::string, std::string > ResultPair
A string-pair containing a name and a value.
Definition: Result.h:32
Declares hext::Capture.
A RAII wrapper for Gumbo.
Definition: Html.h:44