// Copyright 2009 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #ifndef RE2_PREFILTER_TREE_H_ #define RE2_PREFILTER_TREE_H_ // The PrefilterTree class is used to form an AND-OR tree of strings // that would trigger each regexp. The 'prefilter' of each regexp is // added to PrefilterTree, and then PrefilterTree is used to find all // the unique strings across the prefilters. During search, by using // matches from a string matching engine, PrefilterTree deduces the // set of regexps that are to be triggered. The 'string matching // engine' itself is outside of this class, and the caller can use any // favorite engine. PrefilterTree provides a set of strings (called // atoms) that the user of this class should use to do the string // matching. #include #include #include #include "util/util.h" #include "re2/prefilter.h" #include "re2/sparse_array.h" namespace re2 { class PrefilterTree { public: PrefilterTree(); explicit PrefilterTree(int min_atom_len); ~PrefilterTree(); // Adds the prefilter for the next regexp. Note that we assume that // Add called sequentially for all regexps. All Add calls // must precede Compile. void Add(Prefilter* prefilter); // The Compile returns a vector of string in atom_vec. // Call this after all the prefilters are added through Add. // No calls to Add after Compile are allowed. // The caller should use the returned set of strings to do string matching. // Each time a string matches, the corresponding index then has to be // and passed to RegexpsGivenStrings below. void Compile(std::vector* atom_vec); // Given the indices of the atoms that matched, returns the indexes // of regexps that should be searched. The matched_atoms should // contain all the ids of string atoms that were found to match the // content. The caller can use any string match engine to perform // this function. This function is thread safe. void RegexpsGivenStrings(const std::vector& matched_atoms, std::vector* regexps) const; // Print debug prefilter. Also prints unique ids associated with // nodes of the prefilter of the regexp. void PrintPrefilter(int regexpid); private: typedef SparseArray IntMap; typedef std::map StdIntMap; typedef std::map NodeMap; // Each unique node has a corresponding Entry that helps in // passing the matching trigger information along the tree. struct Entry { public: // How many children should match before this node triggers the // parent. For an atom and an OR node, this is 1 and for an AND // node, it is the number of unique children. int propagate_up_at_count; // When this node is ready to trigger the parent, what are the indices // of the parent nodes to trigger. The reason there may be more than // one is because of sharing. For example (abc | def) and (xyz | def) // are two different nodes, but they share the atom 'def'. So when // 'def' matches, it triggers two parents, corresponding to the two // different OR nodes. StdIntMap* parents; // When this node is ready to trigger the parent, what are the // regexps that are triggered. std::vector regexps; }; // Returns true if the prefilter node should be kept. bool KeepNode(Prefilter* node) const; // This function assigns unique ids to various parts of the // prefilter, by looking at if these nodes are already in the // PrefilterTree. void AssignUniqueIds(NodeMap* nodes, std::vector* atom_vec); // Given the matching atoms, find the regexps to be triggered. void PropagateMatch(const std::vector& atom_ids, IntMap* regexps) const; // Returns the prefilter node that has the same NodeString as this // node. For the canonical node, returns node. Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); // A string that uniquely identifies the node. Assumes that the // children of node has already been assigned unique ids. std::string NodeString(Prefilter* node) const; // Recursively constructs a readable prefilter string. std::string DebugNodeString(Prefilter* node) const; // Used for debugging. void PrintDebugInfo(NodeMap* nodes); // These are all the nodes formed by Compile. Essentially, there is // one node for each unique atom and each unique AND/OR node. std::vector entries_; // indices of regexps that always pass through the filter (since we // found no required literals in these regexps). std::vector unfiltered_; // vector of Prefilter for all regexps. std::vector prefilter_vec_; // Atom index in returned strings to entry id mapping. std::vector atom_index_to_id_; // Has the prefilter tree been compiled. bool compiled_; // Strings less than this length are not stored as atoms. const int min_atom_len_; PrefilterTree(const PrefilterTree&) = delete; PrefilterTree& operator=(const PrefilterTree&) = delete; }; } // namespace #endif // RE2_PREFILTER_TREE_H_