All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Rule.h
Go to the documentation of this file.
1 // Copyright 2015, 2016 Thomas Trapp
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef HEXT_RULE_H_INCLUDED
16 #define HEXT_RULE_H_INCLUDED
17 
18 /// @file
19 /// Declares hext::Rule
20 
21 #include "hext/Html.h"
22 #include "hext/HtmlTag.h"
23 #include "hext/Result.h"
24 #include "hext/Match.h"
25 #include "hext/Capture.h"
26 #include "hext/Visibility.h"
27 
28 #include <cstddef>
29 #include <memory>
30 #include <vector>
31 
32 #include <gumbo.h>
33 
34 
35 namespace hext {
36 
37 
38 /// Extracts values from HTML.
39 ///
40 /// A Rule defines how to match and capture HTML nodes. It can be applied to a
41 /// GumboNode tree, where it recursively tries to find matches.
42 ///
43 /// @par Example:
44 /// ~~~~~~~~~~~~~
45 /// // create a rule that matches anchor elements, ..
46 /// Rule anchor(HtmlTag::A);
47 /// // .. which must have an attribute called "href"
48 /// anchor.append_match<AttributeMatch>("href")
49 /// // capture attribute href and save it as "link"
50 /// .append_capture<AttributeCapture>("href", "link");
51 ///
52 /// {
53 /// // create a rule that matches image elements
54 /// Rule img(HtmlTag::IMG);
55 /// // capture attribute src and save it as "img"
56 /// img.append_capture<AttributeCapture>("src", "img");
57 /// // append the image-rule to the anchor-rule
58 /// anchor.append_child(std::move(img));
59 /// }
60 ///
61 /// // anchor is now equivalent to the following hext:
62 /// // <a href:link><img src:img/></a>
63 ///
64 /// Html html(
65 /// "<div><a href='/bob'> <img src='bob.jpg'/> </a></div>"
66 /// "<div><a href='/alice'><img src='alice.jpg'/></a></div>"
67 /// "<div><a href='/carol'><img src='carol.jpg'/></a></div>");
68 ///
69 /// hext::Result result = anchor.extract(html);
70 /// // result will be equivalent to this:
71 /// // vector{
72 /// // map{
73 /// // {"link", "/bob"}
74 /// // {"img", "bob.jpg"}
75 /// // },
76 /// // map{
77 /// // {"link", "/alice"}
78 /// // {"img", "alice.jpg"}
79 /// // },
80 /// // map{
81 /// // {"link", "/carol"}
82 /// // {"img", "carol.jpg"}
83 /// // },
84 /// // }
85 /// ~~~~~~~~~~~~~
86 class HEXT_PUBLIC Rule
87 {
88 public:
89  /// Constructs a Rule.
90  ///
91  /// @param tag: The HtmlTag that this rule matches.
92  /// Default: Match any tag.
93  /// @param optional: A subtree matches only if all mandatory rules were
94  /// matched. Optional rules on the other hand are ignored
95  /// if not found.
96  /// Default: Rule is mandatory.
97  explicit Rule(HtmlTag tag = HtmlTag::ANY,
98  bool optional = false) noexcept;
99 
100  ~Rule() noexcept = default;
101  Rule(Rule&&) noexcept = default;
102  Rule(const Rule& other);
103  Rule& operator=(Rule&&) noexcept = default;
104  Rule& operator=(const Rule& other);
105 
106  /// Returns the first child or nullptr if childless.
107  const Rule * child() const noexcept;
108 
109  /// Returns the next rule or nullptr if no following rule.
110  const Rule * next() const noexcept;
111 
112  /// Returns the first child or nullptr if childless.
113  Rule * child() noexcept;
114 
115  /// Returns the next rule or nullptr if no following rule.
116  Rule * next() noexcept;
117 
118  /// Appends a child.
119  ///
120  /// @param new_child: The Rule to append.
121  /// @returns A reference to this Rule to enable method chaining.
122  Rule& append_child(Rule new_child);
123 
124  /// Appends a following Rule.
125  ///
126  /// @param sibling: The Rule to append.
127  /// @returns A reference to this Rule to enable method chaining.
128  Rule& append_next(Rule sibling);
129 
130  /// Appends a Match.
131  ///
132  /// @param match: The Match to append.
133  /// @returns A reference to this Rule to enable method chaining.
134  Rule& append_match(std::unique_ptr<Match> match);
135 
136  /// Emplaces a Match.
137  /// Forwards arguments to std::make_unique<MatchType> and appends the result.
138  ///
139  /// @returns A reference to this Rule to enable method chaining.
140  template<typename MatchType, typename... Args>
141  Rule& append_match(Args&&... arg)
142  {
143  return this->append_match(
144  std::make_unique<MatchType>(std::forward<Args>(arg)...));
145  }
146 
147  /// Appends a Capture.
148  ///
149  /// @param cap: The Capture to append.
150  /// @returns A reference to this Rule to enable method chaining.
151  Rule& append_capture(std::unique_ptr<Capture> cap);
152 
153  /// Emplaces a Capture.
154  /// Forwards arguments to std::make_unique<CaptureType> and appends the result.
155  ///
156  /// @returns A reference to this Rule to enable method chaining.
157  template<typename CaptureType, typename... Args>
158  Rule& append_capture(Args&&... arg)
159  {
160  return this->append_capture(
161  std::make_unique<CaptureType>(std::forward<Args>(arg)...));
162  }
163 
164  /// Returns the HtmlTag this rule matches.
165  HtmlTag get_tag() const noexcept;
166 
167  /// Sets the HtmlTag this rule matches.
168  ///
169  /// @returns A reference to this Rule to enable method chaining.
170  Rule& set_tag(HtmlTag tag) noexcept;
171 
172  /// Returns true if this rule is optional, i.e. if a match does not have to be found.
173  bool is_optional() const noexcept;
174 
175  /// Sets whether this rule is optional, i.e. whether a match may be absent.
176  ///
177  /// @returns A reference to this Rule to enable method chaining.
178  Rule& set_optional(bool optional) noexcept;
179 
180  /// Recursively extracts values from a hext::Html.
181  ///
182  /// @returns A vector containing maps filled with the captured
183  /// name value pairs.
184  hext::Result extract(const Html& html) const;
185 
186  /// Recursively extracts values from a GumboNode.
187  ///
188  /// @returns A vector containing maps filled with the captured
189  /// name value pairs.
190  hext::Result extract(const GumboNode * node) const;
191 
192  /// Returns true if this Rule matches node.
193  ///
194  /// @param node: A GumboNode that is to be matched.
195  bool matches(const GumboNode * node) const;
196 
197  /// Returns the result of applying every Capture to node.
198  ///
199  /// @param node: A GumboNode that is to be captured.
200  std::vector<ResultPair> capture(const GumboNode * node) const;
201 
202 private:
203  HEXT_PRIVATE void swap(hext::Rule& other) noexcept;
204 
205  std::unique_ptr<Rule> first_child_;
206  std::unique_ptr<Rule> next_;
207  std::vector<std::unique_ptr<Match>> matches_;
208  std::vector<std::unique_ptr<Capture>> captures_;
209 
210  HtmlTag tag_;
211  bool is_optional_;
212 };
213 
214 
215 } // namespace hext
216 
217 
218 #endif // HEXT_RULE_H_INCLUDED
219 
std::vector< ResultMap > Result
A vector containing ResultMap.
Definition: Result.h:45
Abstract base for every Match.
Definition: Match.h:41
All valid HTML tags.
Declares hext::Html.
Defines HEXT_PUBLIC and HEXT_PRIVATE.
Rule & append_match(Args &&...arg)
Emplaces a Match.
Definition: Rule.h:141
Any html tag.
Rule & append_capture(Args &&...arg)
Emplaces a Capture.
Definition: Rule.h:158
Extracts values from HTML.
Definition: Rule.h:86
Typedefs for results returned from capturing HTML.
Abstract base for every Capture.
Definition: Capture.h:44
#define HEXT_PUBLIC
Definition: Visibility.h:26
Declares hext::Match.
#define HEXT_PRIVATE
Definition: Visibility.h:27
HtmlTag
An enum containing all valid HTML tags.
Definition: HtmlTag.h:28
std::pair< std::string, std::string > ResultPair
A string-pair containing a name and a value.
Definition: Result.h:32
Declares hext::Capture.
A RAII wrapper for Gumbo.
Definition: Html.h:46