#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2011 Daniel Gerber.
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program.  If not, see <http://www.gnu.org/licenses/>.

r"""
Parsing and validation of URIs (RFC 3986) and IRIs (RFC 3987).

This module provides regular expressions according to `RFC 3986 "Uniform
Resource Identifier (URI): Generic Syntax"
<http://tools.ietf.org/html/rfc3986>`_ and `RFC 3987 "Internationalized
Resource Identifiers (IRIs)" <http://tools.ietf.org/html/rfc3987>`_, and
utilities for composition and relative resolution of references.


API
---

**match** (string, rule='IRI_reference')
    {match.__doc__}

**parse** (string, rule='IRI_reference')
    {parse.__doc__}

**compose** (\*\*parts)
    {compose.__doc__}

**resolve** (base, uriref, strict=True, return_parts=False)
    {resolve.__doc__}

**patterns**
    A dict of regular expressions with useful group names.
    Compilable (with regex_ only) without need for any particular compilation
    flag.

**[bmp_][u]patterns[_no_names]**
    Alternative versions of `patterns`.
    [u]nicode strings without group names for the re_ module.
    BMP only for narrow builds.

**get_compiled_pattern** (rule, flags=0)
    {get_compiled_pattern.__doc__}

**format_patterns** (\*\*names)
    {format_patterns.__doc__}


Dependencies
------------

Some features require regex_.

This package's docstrings are tested on python 2.6, 2.7, and 3.2 to 3.6.
Note that in python<=3.2, characters beyond the Basic Multilingual Plane are
not supported on narrow builds (see `issue12729
<http://bugs.python.org/issue12729>`_).


Release notes
-------------

version 1.3.8:
    - fixed deprecated escape sequence

version 1.3.6:
    - fixed a bug in IPv6 pattern:

      >>> assert match('::0:0:0:0:0.0.0.0', 'IPv6address')

version 1.3.4:
    - allowed for lower case percent encoding

version 1.3.3:
    - fixed a bug in `resolve` which left "../" at the beginning of some paths

version 1.3.2:
    - convenience function `match`
    - patterns restricted to the BMP for narrow builds
    - adapted doctests for python 3.3
    - compatibility with python 2.6 (thanks to Thijs Janssen)

version 1.3.1:
    - some re_ compatibility: get_compiled_pattern, parse
    - dropped regex_ from setup.py requirements

version 1.3.0:
    - python 3.x compatibility
    - format_patterns

version 1.2.1:
    - compose, resolve


.. _re: http://docs.python.org/library/re
.. _regex: http://pypi.python.org/pypi/regex


Support
-------
This is free software. You may show your appreciation with a `donation`_.

.. _donation: http://danielgerber.net/¤#Thanks-for-python-package-rfc3987

"""
__version__ = '1.3.8'

import sys as _sys

#: True when the interpreter cannot represent code points beyond the BMP
#: as single characters.
NARROW_BUILD = _sys.maxunicode == 0xffff

try:
    basestring
except NameError:
    # python 3: there is a single text string type
    basestring = str

try:
    import regex as _re
    REGEX = True
except ImportError:
    import re as _re
    REGEX = False

__all__ = ('get_compiled_pattern', 'parse', 'format_patterns', 'patterns',
           'compose', 'resolve', 'match')


# NOTE: the rule tables below are templates for ``str.format``: ``{name}``
# refers to another rule and literal braces are doubled.  `format_patterns`
# walks each table in *reverse* order, so a rule may only reference rules
# that are listed after it (or that live in a table processed earlier).

_common_rules = (

    ########   SCHEME   ########
    ('scheme',       r"[a-zA-Z][a-zA-Z0-9+.-]*"),

    ########   PORT   ########
    ('port',         r"[0-9]*"),

    ########   IP ADDRESSES   ########
    ('IP_literal',   r"\[(?:{IPv6address}|{IPvFuture})\]"),
    # Spaces are for alignment only; they are stripped by the `replace` call.
    ('IPv6address', (r"(?:                             (?:{h16}:){{6}} {ls32}"
                     r"|                            :: (?:{h16}:){{5}} {ls32}"
                     r"| (?:                {h16})? :: (?:{h16}:){{4}} {ls32}"
                     r"| (?:(?:{h16}:)?     {h16})? :: (?:{h16}:){{3}} {ls32}"
                     r"| (?:(?:{h16}:){{,2}}{h16})? :: (?:{h16}:){{2}} {ls32}"
                     r"| (?:(?:{h16}:){{,3}}{h16})? :: (?:{h16}:)      {ls32}"
                     r"| (?:(?:{h16}:){{,4}}{h16})? ::                 {ls32}"
                     r"| (?:(?:{h16}:){{,5}}{h16})? ::                 {h16} "
                     r"| (?:(?:{h16}:){{,6}}{h16})? ::                      )"
                     ).replace(' ', '')),
    ('ls32',         r"(?:{h16}:{h16}|{IPv4address})"),
    ('h16',          r"{hexdig}{{1,4}}"),
    ('IPv4address',  r"(?:{dec_octet}\.){{3}}{dec_octet}"),
    ('dec_octet',    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"),
    ('IPvFuture',    r"v{hexdig}+\.(?:{unreserved}|{sub_delims}|:)+"),

    ########  CHARACTER CLASSES   ########
    ('unreserved',   r"[a-zA-Z0-9_.~-]"),
    ('reserved',     r"(?:{gen_delims}|{sub_delims})"),
    ('pct_encoded',  r"%{hexdig}{{2}}"),
    # "[" is escaped inside the set: an unescaped one makes `re` emit a
    # FutureWarning ("possible nested set").  The matched characters are
    # unchanged.
    ('gen_delims',   r"[:/?#\[\]@]"),
    ('sub_delims',   r"[!$&'()*+,;=]"),
    ('hexdig',       r"[0-9A-Fa-f]"),

)


_uri_rules = (

    ########   REFERENCES   ########
    ('URI_reference',   r"(?:{URI}|{relative_ref})"),
    ('URI',             r"{absolute_URI}(?:\#{fragment})?"),
    ('absolute_URI',    r"{scheme}:{hier_part}(?:\?{query})?"),
    ('relative_ref',    r"{relative_part}(?:\?{query})?(?:\#{fragment})?"),

    ('hier_part',      (r"(?://{authority}{path_abempty}"
                        r"|{path_absolute}|{path_rootless}|{path_empty})")),
    ('relative_part',  (r"(?://{authority}{path_abempty}"
                        r"|{path_absolute}|{path_noscheme}|{path_empty})")),

    ########   AUTHORITY   ########
    ('authority',  r"(?:{userinfo}@)?{host}(?::{port})?"),
    ('host',       r"(?:{IP_literal}|{IPv4address}|{reg_name})"),
    ('userinfo',   r"(?:{unreserved}|{pct_encoded}|{sub_delims}|:)*"),
    ('reg_name',   r"(?:{unreserved}|{pct_encoded}|{sub_delims})*"),

    ########   PATH   ########
    ('path',          (r"(?:{path_abempty}|{path_absolute}|{path_noscheme}"
                       r"|{path_rootless}|{path_empty})")),
    ('path_abempty',   r"(?:/{segment})*"),
    ('path_absolute',  r"/(?:{segment_nz}(?:/{segment})*)?"),
    ('path_noscheme',  r"{segment_nz_nc}(?:/{segment})*"),
    ('path_rootless',  r"{segment_nz}(?:/{segment})*"),
    ('path_empty',     r""),

    ('segment',        r"{pchar}*"),
    ('segment_nz',     r"{pchar}+"),
    ('segment_nz_nc',  r"(?:{unreserved}|{pct_encoded}|{sub_delims}|@)+"),

    ########   QUERY   ########
    ('query',          r"(?:{pchar}|/|\?)*"),

    ########   FRAGMENT   ########
    ('fragment',       r"(?:{pchar}|/|\?)*"),

    ########  CHARACTER CLASSES   ########
    ('pchar',          r"(?:{unreserved}|{pct_encoded}|{sub_delims}|:|@)"),

)


#: http://tools.ietf.org/html/rfc3987
#: January 2005
_iri_rules = (

    ########   REFERENCES   ########
    ('IRI_reference',   r"(?:{IRI}|{irelative_ref})"),
    ('IRI',             r"{absolute_IRI}(?:\#{ifragment})?"),
    ('absolute_IRI',    r"{scheme}:{ihier_part}(?:\?{iquery})?"),
    ('irelative_ref',  (r"(?:{irelative_part}"
                        r"(?:\?{iquery})?(?:\#{ifragment})?)")),

    ('ihier_part',     (r"(?://{iauthority}{ipath_abempty}"
                        r"|{ipath_absolute}|{ipath_rootless}|{ipath_empty})")),
    ('irelative_part', (r"(?://{iauthority}{ipath_abempty}"
                        r"|{ipath_absolute}|{ipath_noscheme}|{ipath_empty})")),

    ########   AUTHORITY   ########
    ('iauthority',  r"(?:{iuserinfo}@)?{ihost}(?::{port})?"),
    ('iuserinfo',   r"(?:{iunreserved}|{pct_encoded}|{sub_delims}|:)*"),
    ('ihost',       r"(?:{IP_literal}|{IPv4address}|{ireg_name})"),
    ('ireg_name',   r"(?:{iunreserved}|{pct_encoded}|{sub_delims})*"),

    ########   PATH   ########
    ('ipath',          (r"(?:{ipath_abempty}|{ipath_absolute}|{ipath_noscheme}"
                        r"|{ipath_rootless}|{ipath_empty})")),
    ('ipath_empty',     r""),
    ('ipath_rootless',  r"{isegment_nz}(?:/{isegment})*"),
    ('ipath_noscheme',  r"{isegment_nz_nc}(?:/{isegment})*"),
    ('ipath_absolute',  r"/(?:{isegment_nz}(?:/{isegment})*)?"),
    ('ipath_abempty',   r"(?:/{isegment})*"),

    ('isegment_nz_nc',  r"(?:{iunreserved}|{pct_encoded}|{sub_delims}|@)+"),
    ('isegment_nz',     r"{ipchar}+"),
    ('isegment',        r"{ipchar}*"),

    ########   QUERY   ########
    ('iquery',          r"(?:{ipchar}|{iprivate}|/|\?)*"),

    ########   FRAGMENT   ########
    ('ifragment',       r"(?:{ipchar}|/|\?)*"),

    ########   CHARACTER CLASSES   ########
    ('ipchar',       r"(?:{iunreserved}|{pct_encoded}|{sub_delims}|:|@)"),
    ('iunreserved',  r"(?:[a-zA-Z0-9._~-]|{ucschar})"),
    ('iprivate',  r"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"),
    ('ucschar',  (r"[\xA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"
                  r"\U00010000-\U0001FFFD\U00020000-\U0002FFFD"
                  r"\U00030000-\U0003FFFD\U00040000-\U0004FFFD"
                  r"\U00050000-\U0005FFFD\U00060000-\U0006FFFD"
                  r"\U00070000-\U0007FFFD\U00080000-\U0008FFFD"
                  r"\U00090000-\U0009FFFD\U000A0000-\U000AFFFD"
                  r"\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD"
                  r"\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD]")),

)


def format_patterns(**names):
    r"""Returns a dict of patterns (regular expressions) keyed by
    `rule names for URIs`_ and `rule names for IRIs`_.

    See also the module level dicts of patterns, and `get_compiled_pattern`.

    To wrap a rule in a named capture group, pass it as keyword argument:
    rule_name='group_name'. By default, the formatted patterns contain no
    named groups. If the value passed for a rule is callable, it is called
    with the rule's template and its return value is used instead.

    Patterns are `str` instances (be it in python 2.x or 3.x) containing ASCII
    characters only.

    Caveats:

      - with re_, named capture groups cannot occur on multiple branches of an
        alternation

      - with re_ before python 3.3, ``\u`` and ``\U`` escapes must be
        preprocessed (see `issue3665 <http://bugs.python.org/issue3665>`_)

      - on narrow builds, character ranges beyond BMP are not supported

    .. _rule names for URIs: http://tools.ietf.org/html/rfc3986#appendix-A
    .. _rule names for IRIs: http://tools.ietf.org/html/rfc3987#section-2.2
    """
    formatted = {}
    # Reverse order: referenced rules are expanded before the rules using them.
    for name, pat in _common_rules[::-1] + _uri_rules[::-1] + _iri_rules[::-1]:
        if name in names:
            n = names[name]
            if callable(n):
                pat = n(pat)
            else:
                pat = '(?P<%s>%s)' % (n, pat)
        formatted[name] = pat.format(**formatted)
    return formatted


_GROUP_NAMES_BASE = [
    'scheme', 'port',
    'IPv6address', 'IPv4address', 'IPvFuture',
    'URI_reference',
    'URI', 'absolute_URI', 'relative_ref', 'relative_part',
    'authority', 'host', 'userinfo', 'reg_name',
    'query', 'fragment',
    'IRI_reference',
    'IRI', 'absolute_IRI', 'irelative_ref', 'irelative_part',
    'iauthority', 'ihost', 'iuserinfo', 'ireg_name',
    'iquery', 'ifragment'
]

# Every path flavour is captured under the single group name 'path'
# (respectively 'ipath').
DEFAULT_GROUP_NAMES = dict(zip(_GROUP_NAMES_BASE, _GROUP_NAMES_BASE),
                           path_abempty='path',
                           path_absolute='path',
                           path_noscheme='path',
                           path_rootless='path',
                           path_empty='path',
                           ipath_abempty='ipath',
                           ipath_absolute='ipath',
                           ipath_noscheme='ipath',
                           ipath_rootless='ipath',
                           ipath_empty='ipath')


#: mapping of rfc3986 / rfc3987 rule names to regular expressions
patterns = format_patterns(**DEFAULT_GROUP_NAMES)


def _interpret_unicode_escapes(string):
    """Replaces the ``\\uXXXX`` / ``\\UXXXXXXXX`` escapes of an ASCII pattern
    by the characters they denote (other backslash escapes are left as is).
    """
    return string.encode('ascii').decode('raw-unicode-escape')


def _bmp(s):
    """Strips the ``\\UXXXXXXXX-\\UXXXXXXXX`` character ranges from `s`."""
    return _re.sub(r'\\U[0-9A-F]{8}-\\U[0-9A-F]{8}', '', s)


patterns_no_names = format_patterns()

# if not REGEX:
#: patterns compilable with re
upatterns_no_names = dict((k, _interpret_unicode_escapes(v)) for k, v
                          in patterns_no_names.items())

#: patterns restricted to the basic multilingual plane
#: compilable on narrow build
bmp_patterns = dict((k, _bmp(v)) for k, v in patterns.items())

#: compilable on narrow build with re
bmp_upatterns_no_names = dict((k, _interpret_unicode_escapes(_bmp(v)))
                              for k, v in patterns_no_names.items())


def get_compiled_pattern(rule, flags=0):
    """Returns a compiled pattern object for a rule name or template string.

    `rule` is either a key of `patterns` or a %-style template in which
    ``%(rule_name)s`` stands for the pattern of that rule. Compiled patterns
    are cached per (rule, flags).

    Usage for validation::

        >>> uri = get_compiled_pattern('^%(URI)s$')
        >>> assert uri.match('http://tools.ietf.org/html/rfc3986#appendix-A')
        >>> assert not get_compiled_pattern('^%(relative_ref)s$').match('#f#g')
        >>> from unicodedata import lookup
        >>> smp = 'urn:' + lookup('OLD ITALIC LETTER A')  # U+00010300
        >>> assert not uri.match(smp)
        >>> m = get_compiled_pattern('^%(IRI)s$').match(smp)

    On narrow builds, non-BMP characters are (incorrectly) excluded::

        >>> assert NARROW_BUILD == (not m)

    For parsing, some subcomponents are captured in named groups (*only if*
    regex_ is available, otherwise see `parse`)::

        >>> match = uri.match('http://tools.ietf.org/html/rfc3986#appendix-A')
        >>> d = match.groupdict()
        >>> if REGEX:
        ...     assert all([ d['scheme'] == 'http',
        ...                  d['authority'] == 'tools.ietf.org',
        ...                  d['path'] == '/html/rfc3986',
        ...                  d['query'] == None,
        ...                  d['fragment'] == 'appendix-A' ])

        >>> for r in patterns.keys():
        ...     assert get_compiled_pattern(r)

    """
    cache, key = get_compiled_pattern.cache, (rule, flags)
    if key not in cache:
        if NARROW_BUILD:
            pats = bmp_patterns if REGEX else bmp_upatterns_no_names
        else:
            pats = patterns if REGEX else upatterns_no_names
        # Membership test rather than truthiness: the pattern of
        # 'path_empty' / 'ipath_empty' may be the empty string, which must
        # not be mistaken for "unknown rule name".
        p = pats[rule] if rule in pats else rule % pats
        cache[key] = _re.compile(p, flags)
    return cache[key]
get_compiled_pattern.cache = {}


def match(string, rule='IRI_reference'):
    """Convenience function for checking if `string` matches a specific rule.

    Returns a match object or None::

        >>> assert match('%C7X', 'pct_encoded') is None
        >>> assert match('%C7',  'pct_encoded')
        >>> assert match('%c7',  'pct_encoded')

    """
    return get_compiled_pattern('^%%(%s)s$' % rule).match(string)


#: http://tools.ietf.org/html/rfc3986#appendix-B
# The components are captured under the i-prefixed names; `_i2u` then mirrors
# them to 'authority', 'path', 'query' and 'fragment'.
_iri_non_validating_re = _re.compile(
    r"^((?P<scheme>[^:/?#]+):)?(//(?P<iauthority>[^/?#]*))?"
    r"(?P<ipath>[^?#]*)(\?(?P<iquery>[^#]*))?(#(?P<ifragment>.*))?")

REFERENCE_RULES = ('IRI_reference', 'IRI', 'absolute_IRI',
                   'irelative_ref', 'irelative_part',
                   'URI_reference', 'URI', 'absolute_URI',
                   'relative_ref', 'relative_part')


def parse(string, rule='IRI_reference'):
    """Parses `string` according to `rule` into a dict of subcomponents.

    If `rule` is None, parse an IRI_reference `without validation
    <http://tools.ietf.org/html/rfc3986#appendix-B>`_.

    If regex_ is available, any rule is supported; with re_, `rule` must be
    'IRI_reference' or some special case thereof ('IRI', 'absolute_IRI',
    'irelative_ref', 'irelative_part', 'URI_reference', 'URI', 'absolute_URI',
    'relative_ref', 'relative_part').

    Raises ValueError if `string` does not match `rule`, or if `rule` is not
    supported with re_. ::

        >>> d = parse('http://tools.ietf.org/html/rfc3986#appendix-A',
        ...           rule='URI')
        >>> assert all([ d['scheme'] == 'http',
        ...              d['authority'] == 'tools.ietf.org',
        ...              d['path'] == '/html/rfc3986',
        ...              d['query'] == None,
        ...              d['fragment'] == 'appendix-A' ])

    """
    if not REGEX and rule and rule not in REFERENCE_RULES:
        raise ValueError(rule)
    if rule:
        m = match(string, rule)
        if not m:
            raise ValueError('%r is not a valid %r.' % (string, rule))
        if REGEX:
            return _i2u(m.groupdict())
    # No rule, or no named groups available (re): split the components with
    # the generic, non-validating expression.
    return _i2u(_iri_non_validating_re.match(string).groupdict())


def _i2u(dic):
    """Fills the missing (or None) 'authority', 'path', 'query' and
    'fragment' entries of `dic` from their i-prefixed counterparts.

    `dic` is modified in place and returned.
    """
    for (name, iname) in [('authority', 'iauthority'), ('path', 'ipath'),
                          ('query', 'iquery'), ('fragment', 'ifragment')]:
        if dic.get(name) is None:
            dic[name] = dic.get(iname)
    return dic


def compose(scheme=None, authority=None, path=None, query=None, fragment=None,
            iauthority=None, ipath=None, iquery=None, ifragment=None, **kw):
    """Returns an URI composed_ from named parts.

    Each of `authority`, `path`, `query` and `fragment` falls back to its
    i-prefixed counterpart when it is None. Other keyword arguments are
    ignored.

    .. _composed: http://tools.ietf.org/html/rfc3986#section-5.3
    """
    # Explicit fallbacks: writing into the dict returned by locals() does not
    # rebind the local variables of a function.
    if authority is None:
        authority = iauthority
    if path is None:
        path = ipath
    if query is None:
        query = iquery
    if fragment is None:
        fragment = ifragment
    res = ''
    if scheme is not None:
        res += scheme + ':'
    if authority is not None:
        res += '//' + authority
    res += path or ''
    if query is not None:
        res += '?' + query
    if fragment is not None:
        res += '#' + fragment
    return res


# leading "./" and "../" segments, and "." segments anywhere after a "/"
_dot_segments = get_compiled_pattern(r'^(?:\.{1,2}(?:/|$))+|(?<=/)\.(?:/|$)')
# a segment followed by a ".." segment
_2dots_segments = get_compiled_pattern(r'/?%(segment)s/\.{2}(?:/|$)')

def _remove_dot_segments(path):
    """Returns `path` with its "." and ".." segments resolved."""
    path = _dot_segments.sub('', path)
    c = 1
    # Collapse one "<segment>/.." pair at a time, leftmost first, until no
    # substitution happens.
    while c:
        path, c = _2dots_segments.subn('/', path, 1)
    return path


def resolve(base, uriref, strict=True, return_parts=False):
    """Resolves_ an `URI reference` relative to a `base` URI.

    `base` and `uriref` are either strings or mappings of named parts (as
    returned by `parse`). Raises ValueError if `base` has no scheme.

    `Test cases <http://tools.ietf.org/html/rfc3986#section-5.4>`_::

        >>> base = resolve.test_cases_base
        >>> for relative, resolved in resolve.test_cases.items():
        ...     assert resolve(base, relative) == resolved

    If `return_parts` is True, returns a dict of named parts instead of
    a string.

    Examples::

        >>> assert resolve('urn:rootless', '../../name') == 'urn:name'
        >>> assert resolve('urn:root/less', '../../name') == 'urn:/name'
        >>> assert resolve('http://a/b', 'http:g') == 'http:g'
        >>> assert resolve('http://a/b', 'http:g', strict=False) == 'http://a/g'

    .. _Resolves: http://tools.ietf.org/html/rfc3986#section-5.2

    """
    #base = normalize(base)
    if isinstance(base, basestring):
        B = parse(base, 'IRI')
    else:
        B = _i2u(dict(base))
    if not B.get('scheme'):
        raise ValueError('Expected an IRI (with scheme), not %r.' % base)

    if isinstance(uriref, basestring):
        R = parse(uriref, 'IRI_reference')
    else:
        R = _i2u(dict(uriref))

    # `.get`: a mapping given as `uriref` need not contain a 'scheme' entry.
    r_scheme = R.get('scheme')
    if r_scheme and (strict or r_scheme != B['scheme']):
        # The reference is absolute: only its path needs normalizing.
        T = R
    else:
        T = {}
        T['scheme'] = B['scheme']
        if R['authority'] is not None:
            T['authority'] = R['authority']
            T['path'] = R['path']
            T['query'] = R['query']
        else:
            T['authority'] = B['authority']
            if R['path']:
                if R['path'][:1] == "/":
                    T['path'] = R['path']
                elif B['authority'] is not None and not B['path']:
                    T['path'] = '/%s' % R['path']
                else:
                    # Merge: replace the last segment of the base path.
                    T['path'] = (''.join((B['path'] or '').rpartition('/')[:2])
                                 + R['path'])
                T['query'] = R['query']
            else:
                T['path'] = B['path']
                if R['query'] is not None:
                    T['query'] = R['query']
                else:
                    T['query'] = B['query']
        T['fragment'] = R['fragment']
    # A mapping input may carry no path at all; treat it as the empty path.
    T['path'] = _remove_dot_segments(T['path'] or '')
    if return_parts:
        return T
    else:
        return compose(**T)


resolve.test_cases_base = "http://a/b/c/d;p?q"
resolve.test_cases = {
    "g:h"           :  "g:h",
    "g"             :  "http://a/b/c/g",
    "./g"           :  "http://a/b/c/g",
    "g/"            :  "http://a/b/c/g/",
    "/g"            :  "http://a/g",
    "//g"           :  "http://g",
    "?y"            :  "http://a/b/c/d;p?y",
    "g?y"           :  "http://a/b/c/g?y",
    "#s"            :  "http://a/b/c/d;p?q#s",
    "g#s"           :  "http://a/b/c/g#s",
    "g?y#s"         :  "http://a/b/c/g?y#s",
    ";x"            :  "http://a/b/c/;x",
    "g;x"           :  "http://a/b/c/g;x",
    "g;x?y#s"       :  "http://a/b/c/g;x?y#s",
    ""              :  "http://a/b/c/d;p?q",
    "."             :  "http://a/b/c/",
    "./"            :  "http://a/b/c/",
    ".."            :  "http://a/b/",
    "../"           :  "http://a/b/",
    "../g"          :  "http://a/b/g",
    "../.."         :  "http://a/",
    "../../"        :  "http://a/",
    "../../g"       :  "http://a/g",
    "../../../g"    :  "http://a/g",
    "../../../../g" :  "http://a/g",
    "/./g"          :  "http://a/g",
    "/../g"         :  "http://a/g",
    "g."            :  "http://a/b/c/g.",
    ".g"            :  "http://a/b/c/.g",
    "g.."           :  "http://a/b/c/g..",
    "..g"           :  "http://a/b/c/..g",
    "./../g"        :  "http://a/b/g",
    "./g/."         :  "http://a/b/c/g/",
    "g/./h"         :  "http://a/b/c/g/h",
    "g/../h"        :  "http://a/b/c/h",
    "g;x=1/./y"     :  "http://a/b/c/g;x=1/y",
    "g;x=1/../y"    :  "http://a/b/c/y",
    "g?y/./x"       :  "http://a/b/c/g?y/./x",
    "g?y/../x"      :  "http://a/b/c/g?y/../x",
    "g#s/./x"       :  "http://a/b/c/g#s/./x",
    "g#s/../x"      :  "http://a/b/c/g#s/../x",
}


def normalize(uri):
    "Syntax-Based Normalization"
    # TODO:
    raise NotImplementedError


if __name__ == '__main__':
    if not _sys.argv[1:]:
        print('Valid arguments are "--all" or rule names from:')
        print(' '.join(sorted(patterns)))
    elif _sys.argv[1] == '--all':
        for name in patterns:
            print(name + ':')
            print(patterns[name])
    else:
        for name in _sys.argv[1:]:
            print(patterns[name])