/** * @file url_pattern.h * @brief URLPattern API implementation. * * This header provides the URLPattern API as specified by the WHATWG URL * Pattern Standard. URLPattern allows matching URLs against patterns with * wildcards and named groups, similar to how regular expressions match strings. * * @see https://urlpattern.spec.whatwg.org/ * @see https://developer.mozilla.org/en-US/docs/Web/API/URL_Pattern_API */ #ifndef ADA_URL_PATTERN_H #define ADA_URL_PATTERN_H #include "ada/implementation.h" #include "ada/expected.h" #include "ada/parser.h" #include "ada/url_pattern_init.h" #include #include #include #include #include #include #if ADA_TESTING #include #endif // ADA_TESTING #if ADA_INCLUDE_URL_PATTERN namespace ada { enum class url_pattern_part_type : uint8_t { // The part represents a simple fixed text string. FIXED_TEXT, // The part represents a matching group with a custom regular expression. REGEXP, // The part represents a matching group that matches code points up to the // next separator code point. This is typically used for a named group like // ":foo" that does not have a custom regular expression. SEGMENT_WILDCARD, // The part represents a matching group that greedily matches all code points. // This is typically used for the "*" wildcard matching group. FULL_WILDCARD, }; // Pattern type for fast-path matching optimization. // This allows skipping expensive regex evaluation for common simple patterns. enum class url_pattern_component_type : uint8_t { // Pattern is "^$" - only matches empty string EMPTY, // Pattern is "^$" - exact string match (no regex needed) EXACT_MATCH, // Pattern is "^(.*)$" - matches anything (full wildcard) FULL_WILDCARD, // Pattern requires actual regex evaluation REGEXP, }; enum class url_pattern_part_modifier : uint8_t { // The part does not have a modifier. none, // The part has an optional modifier indicated by the U+003F (?) code point. optional, // The part has a "zero or more" modifier indicated by the U+002A (*) code // point. zero_or_more, // The part has a "one or more" modifier indicated by the U+002B (+) code // point. one_or_more, }; // @see https://urlpattern.spec.whatwg.org/#part class url_pattern_part { public: url_pattern_part(url_pattern_part_type _type, std::string&& _value, url_pattern_part_modifier _modifier) : type(_type), value(std::move(_value)), modifier(_modifier) {} url_pattern_part(url_pattern_part_type _type, std::string&& _value, url_pattern_part_modifier _modifier, std::string&& _name, std::string&& _prefix, std::string&& _suffix) : type(_type), value(std::move(_value)), modifier(_modifier), name(std::move(_name)), prefix(std::move(_prefix)), suffix(std::move(_suffix)) {} // A part has an associated type, a string, which must be set upon creation. url_pattern_part_type type; // A part has an associated value, a string, which must be set upon creation. std::string value; // A part has an associated modifier a string, which must be set upon // creation. url_pattern_part_modifier modifier; // A part has an associated name, a string, initially the empty string. std::string name{}; // A part has an associated prefix, a string, initially the empty string. std::string prefix{}; // A part has an associated suffix, a string, initially the empty string. std::string suffix{}; inline bool is_regexp() const noexcept; }; // @see https://urlpattern.spec.whatwg.org/#options-header struct url_pattern_compile_component_options { url_pattern_compile_component_options() = default; explicit url_pattern_compile_component_options( std::optional new_delimiter = std::nullopt, std::optional new_prefix = std::nullopt) : delimiter(new_delimiter), prefix(new_prefix) {} inline std::string_view get_delimiter() const ada_warn_unused; inline std::string_view get_prefix() const ada_warn_unused; // @see https://urlpattern.spec.whatwg.org/#options-ignore-case bool ignore_case = false; static url_pattern_compile_component_options DEFAULT; static url_pattern_compile_component_options HOSTNAME; static url_pattern_compile_component_options PATHNAME; private: // @see https://urlpattern.spec.whatwg.org/#options-delimiter-code-point std::optional delimiter{}; // @see https://urlpattern.spec.whatwg.org/#options-prefix-code-point std::optional prefix{}; }; // The default options is an options struct with delimiter code point set to // the empty string and prefix code point set to the empty string. inline url_pattern_compile_component_options url_pattern_compile_component_options::DEFAULT(std::nullopt, std::nullopt); // The hostname options is an options struct with delimiter code point set // "." and prefix code point set to the empty string. inline url_pattern_compile_component_options url_pattern_compile_component_options::HOSTNAME('.', std::nullopt); // The pathname options is an options struct with delimiter code point set // "/" and prefix code point set to "/". inline url_pattern_compile_component_options url_pattern_compile_component_options::PATHNAME('/', '/'); // A struct providing the URLPattern matching results for a single // URL component. The URLPatternComponentResult is only ever used // as a member attribute of a URLPatternResult struct. The // URLPatternComponentResult API is defined as part of the URLPattern // specification. struct url_pattern_component_result { std::string input; std::unordered_map> groups; bool operator==(const url_pattern_component_result&) const; #if ADA_TESTING friend void PrintTo(const url_pattern_component_result& result, std::ostream* os) { *os << "input: '" << result.input << "', group: "; for (const auto& group : result.groups) { *os << "(" << group.first << ", " << group.second.value_or("undefined") << ") "; } } #endif // ADA_TESTING }; template class url_pattern_component { public: url_pattern_component() = default; // This function explicitly takes a std::string because it is moved. // To avoid unnecessary copy, move each value while calling the constructor. url_pattern_component(std::string&& new_pattern, typename regex_provider::regex_type&& new_regexp, std::vector&& new_group_name_list, bool new_has_regexp_groups, url_pattern_component_type new_type, std::string&& new_exact_match_value = {}) : regexp(std::move(new_regexp)), pattern(std::move(new_pattern)), group_name_list(std::move(new_group_name_list)), exact_match_value(std::move(new_exact_match_value)), has_regexp_groups(new_has_regexp_groups), type(new_type) {} // @see https://urlpattern.spec.whatwg.org/#compile-a-component template static tl::expected compile( std::string_view input, F& encoding_callback, url_pattern_compile_component_options& options); // @see https://urlpattern.spec.whatwg.org/#create-a-component-match-result url_pattern_component_result create_component_match_result( std::string&& input, std::vector>&& exec_result); // Fast path test that returns true/false without constructing result groups. // Uses cached pattern type to skip regex evaluation for simple patterns. bool fast_test(std::string_view input) const noexcept; // Fast path match that returns capture groups without regex for simple // patterns. Returns nullopt if pattern doesn't match, otherwise returns // capture groups. std::optional>> fast_match( std::string_view input) const; #if ADA_TESTING friend void PrintTo(const url_pattern_component& component, std::ostream* os) { *os << "pattern: '" << component.pattern << "', has_regexp_groups: " << component.has_regexp_groups << "group_name_list: "; for (const auto& name : component.group_name_list) { *os << name << ", "; } } #endif // ADA_TESTING typename regex_provider::regex_type regexp{}; std::string pattern{}; std::vector group_name_list{}; // For EXACT_MATCH type: the literal string to compare against std::string exact_match_value{}; bool has_regexp_groups = false; // Cached pattern type for fast-path optimization url_pattern_component_type type = url_pattern_component_type::REGEXP; }; // A URLPattern input can be either a string or a URLPatternInit object. // If it is a string, it must be a valid UTF-8 string. using url_pattern_input = std::variant; // A struct providing the URLPattern matching results for all // components of a URL. The URLPatternResult API is defined as // part of the URLPattern specification. struct url_pattern_result { std::vector inputs; url_pattern_component_result protocol; url_pattern_component_result username; url_pattern_component_result password; url_pattern_component_result hostname; url_pattern_component_result port; url_pattern_component_result pathname; url_pattern_component_result search; url_pattern_component_result hash; }; struct url_pattern_options { bool ignore_case = false; #if ADA_TESTING friend void PrintTo(const url_pattern_options& options, std::ostream* os) { *os << "ignore_case: '" << options.ignore_case; } #endif // ADA_TESTING }; /** * @brief URL pattern matching class implementing the URLPattern API. * * URLPattern provides a way to match URLs against patterns with wildcards * and named capture groups. It's useful for routing, URL-based dispatching, * and URL validation. * * Pattern syntax supports: * - Literal text matching * - Named groups: `:name` (matches up to the next separator) * - Wildcards: `*` (matches everything) * - Custom regex: `(pattern)` * - Optional segments: `:name?` * - Repeated segments: `:name+`, `:name*` * * @tparam regex_provider The regex implementation to use for pattern matching. * Must satisfy the url_pattern_regex::regex_concept. * * @note All string inputs must be valid UTF-8. * * @see https://urlpattern.spec.whatwg.org/ */ template class url_pattern { public: url_pattern() = default; /** * If non-null, base_url must pointer at a valid UTF-8 string. * @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-exec */ result> exec( const url_pattern_input& input, const std::string_view* base_url = nullptr); /** * If non-null, base_url must pointer at a valid UTF-8 string. * @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-test */ result test(const url_pattern_input& input, const std::string_view* base_url = nullptr); /** * @see https://urlpattern.spec.whatwg.org/#url-pattern-match * This function expects a valid UTF-8 string if input is a string. */ result> match( const url_pattern_input& input, const std::string_view* base_url_string = nullptr); // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-protocol [[nodiscard]] std::string_view get_protocol() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-username [[nodiscard]] std::string_view get_username() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-password [[nodiscard]] std::string_view get_password() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hostname [[nodiscard]] std::string_view get_hostname() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-port [[nodiscard]] std::string_view get_port() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-pathname [[nodiscard]] std::string_view get_pathname() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-search [[nodiscard]] std::string_view get_search() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hash [[nodiscard]] std::string_view get_hash() const ada_lifetime_bound; // If ignoreCase is true, the JavaScript regular expression created for each // pattern must use the `vi` flag. Otherwise, they must use the `v` flag. [[nodiscard]] bool ignore_case() const; // @see https://urlpattern.spec.whatwg.org/#url-pattern-has-regexp-groups [[nodiscard]] bool has_regexp_groups() const; // Helper to test all components at once. Returns true if all match. [[nodiscard]] bool test_components( std::string_view protocol, std::string_view username, std::string_view password, std::string_view hostname, std::string_view port, std::string_view pathname, std::string_view search, std::string_view hash) const; #if ADA_TESTING friend void PrintTo(const url_pattern& c, std::ostream* os) { *os << "protocol_component: '" << c.get_protocol() << ", "; *os << "username_component: '" << c.get_username() << ", "; *os << "password_component: '" << c.get_password() << ", "; *os << "hostname_component: '" << c.get_hostname() << ", "; *os << "port_component: '" << c.get_port() << ", "; *os << "pathname_component: '" << c.get_pathname() << ", "; *os << "search_component: '" << c.get_search() << ", "; *os << "hash_component: '" << c.get_hash(); } #endif // ADA_TESTING template friend tl::expected, errors> parser::parse_url_pattern_impl( std::variant&& input, const std::string_view* base_url, const url_pattern_options* options); /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component protocol_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component username_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component password_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component hostname_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component port_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component pathname_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component search_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ url_pattern_component hash_component{}; /** * @private * We can not make this private due to a LLVM bug. * Ref: https://github.com/ada-url/ada/pull/859 */ bool ignore_case_ = false; }; } // namespace ada #endif // ADA_INCLUDE_URL_PATTERN #endif