/* SPDX-License-Identifier: MIT */ /* Copyright © 2022 Max Bachmann */ #pragma once #include "details/common.hpp" #include "details/jaro_impl.hpp" #include namespace duckdb_jaro_winkler { /** * @defgroup jaro_winkler jaro_winkler * @{ */ /** * @brief Calculates the jaro winkler similarity * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 * string to compare with s2 (for type info check Template parameters above) * @param s2 * string to compare with s1 (for type info check Template parameters above) * @param prefix_weight * Weight used for the common prefix of the two strings. * Has to be between 0 and 0.25. Default is 0.1. * @param score_cutoff * Optional argument for a score threshold as a float between 0 and 100. * For similarity < score_cutoff 0 is returned instead. Default is 0, * which deactivates this behaviour. * * @return jaro winkler similarity between s1 and s2 * as a float between 0 and 100 */ template typename std::enable_if< common::is_iterator::value && common::is_iterator::value, double>::type jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double prefix_weight = 0.1, double score_cutoff = 0.0) { if (prefix_weight < 0.0 || prefix_weight > 0.25) { throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25"); } return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight, score_cutoff); } template double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1, double score_cutoff = 0.0) { return jaro_winkler_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), prefix_weight, score_cutoff); } template struct CachedJaroWinklerSimilarity { template CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1) : s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_) { if (prefix_weight < 0.0 || prefix_weight > 0.25) { throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25"); } } template CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1) : CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const { return detail::jaro_winkler_similarity(PM, std::begin(s1), std::end(s1), first2, last2, prefix_weight, score_cutoff); } template double similarity(const S2& s2, double score_cutoff = 0) const { return similarity(std::begin(s2), std::end(s2), score_cutoff); } template double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const { return similarity(first2, last2, score_cutoff); } template double normalized_similarity(const S2& s2, double score_cutoff = 0) const { return similarity(s2, score_cutoff); } private: std::basic_string s1; common::BlockPatternMatchVector PM; double prefix_weight; }; /** * @brief Calculates the jaro similarity * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 * string to compare with s2 (for type info check Template parameters above) * @param s2 * string to compare with s1 (for type info check Template parameters above) * @param score_cutoff * Optional argument for a score threshold as a float between 0 and 100. * For similarity < score_cutoff 0 is returned instead. Default is 0, * which deactivates this behaviour. * * @return jaro similarity between s1 and s2 * as a float between 0 and 100 */ template double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) { return detail::jaro_similarity(first1, last1, first2, last2, score_cutoff); } template double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0) { return jaro_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff); } template struct CachedJaroSimilarity { template CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1) {} template CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const { return detail::jaro_similarity(PM, std::begin(s1), std::end(s1), first2, last2, score_cutoff); } template double similarity(const S2& s2, double score_cutoff = 0) const { return similarity(std::begin(s2), std::end(s2), score_cutoff); } template double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const { return similarity(first2, last2, score_cutoff); } template double normalized_similarity(const S2& s2, double score_cutoff = 0) const { return similarity(s2, score_cutoff); } private: std::basic_string s1; common::BlockPatternMatchVector PM; }; /**@}*/ } // namespace duckdb_jaro_winkler