Module `freshlinks.canonicalize`

Generate canonical URLs roughly using Google's Safe Browsing standard.

Differences:

Fragments are removed, but #! is converted to ?_escaped_fragment_=
Username and password are NOT removed from the host.

See:

Expand source code

"""Generate canonical URLs roughly using Google's Safe Browsing standard.

Differences:

- Fragments are removed, but `#!` is converted to `?_escaped_fragment_=`
- Username and password are NOT removed from the host.

See:

- [URLs and Hashing: Canonicalization](https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization)
- [`expression.py`](https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py)
- [`url_normalize.py`](https://github.com/niksite/url-normalize/blob/master/url_normalize/url_normalize.py)
"""

# This file is a rewrite of Google's Python 2.5 implementation.
#
# Copyright 2023 Metaist LLC.
# Licensed under the MIT License.
#
# The original `expression.py` file carries this license:
#
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# std
from typing import List
from typing import Union
from typing import cast
from urllib.parse import quote
from urllib.parse import unquote
from urllib.parse import unquote_to_bytes
import re
import string

# lib
from attrbox import AttrDict
from url_normalize.tools import deconstruct_url  # type: ignore
from url_normalize.tools import reconstruct_url
from url_normalize.url_normalize import normalize_port  # type: ignore
from url_normalize.url_normalize import normalize_query
from url_normalize.url_normalize import normalize_userinfo
from url_normalize.url_normalize import provide_url_scheme

DEFAULT_SCHEME = "http"
"""Default scheme that browsers uses."""

SAFE_CHARS = "".join(
    c
    for c in string.digits + string.ascii_letters + string.punctuation
    if c not in "%#"
)
"""Characters that are not escaped."""

IP_WITH_TRAILING_SPACE = re.compile(r"^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) ")
POSSIBLE_IP = re.compile(r"^((?:0x[0-9a-f]+|[0-9\\.])+)$", flags=re.I)
FIND_BAD_OCTAL_REGEXP = re.compile(r"(^|\.)0\d*[89]")

HEX = re.compile(r"^0x([a-fA-F0-9]+)$")
OCT = re.compile(r"^0([0-7]+)$")
DEC = re.compile(r"^(\d+)$")


def unquote2(string: Union[str, bytes]) -> Union[str, bytes]:
    """Unquote both strings and bytes."""
    if isinstance(string, bytes):
        return unquote_to_bytes(string)
    return unquote(string)


def escape(string: Union[str, bytes]) -> str:
    """Fully escape `string`, then re-escape once."""
    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py#292
    unquoted = unquote2(string)
    while unquoted != string:
        string = unquoted
        unquoted = unquote2(unquoted)
    return quote(unquoted, SAFE_CHARS)


def canonical_ip(host: str) -> str:
    """Return a canonical IP address."""
    if len(host) <= 15:
        # This handles the Windows resolver allows an IP
        # followed by a space and something else as long as it
        # is under 15 characters.
        if m := IP_WITH_TRAILING_SPACE.match(host):
            host = m.group(1)

    if not POSSIBLE_IP.match(host):
        return ""

    # Try to parse octal, if possible.
    allow_octal = not FIND_BAD_OCTAL_REGEXP.search(host)

    # Skip trailing, leading and consecutive dots.
    parts = [part for part in host.split(".") if part]
    if len(parts) > 4:
        return ""

    ip: List[str] = []
    for i, part in enumerate(parts):
        if m := HEX.match(part):
            base = 16
        elif allow_octal and (m := OCT.match(part)):
            base = 8
        elif m := DEC.match(part):
            base = 10
        else:
            return ""

        # print("part:", part, "m:", m.group(1), "base:", base)
        n = int(m.group(1), base)
        if n <= 255:
            ip.append(str(n))
            continue

        # print("n > 255:", n)
        if i < len(parts) - 1:
            n &= 0xFF
            ip.append(str(n))
        else:
            bar = bytearray()
            while n > 0 and len(bar) < 4:
                bar.append(n & 0xFF)
                n >>= 8

            if len(ip) + len(bar) > 4:
                return ""

            bar.reverse()
            ip.extend(str(b) for b in bar)

    return ".".join((ip + (["0"] * 4))[:4])


def canonical_host(host: str) -> str:
    """Return a canonical hostname."""
    # 0: IDN host names should be converted to ASCII punycode
    # 1: Remove all leading and trailing dots.
    # 2: Replace consecutive dots with a single dot.
    # 3: If the hostname can be parsed as an IP address, normalize it to
    #    4 dot-separated decimal values. The client should handle any legal IP-address
    #    encoding, including octal, hex, and fewer than four components.
    # 4: Lowercase the whole string.

    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py#207
    if not host:
        return ""

    result = host.lower()  # Rule 4
    result = ".".join([part for part in result.split(".") if part])  # Rule 1 & 2
    result = result.encode("idna").decode("utf-8")  # Rule 0

    if ip := canonical_ip(host):  # Rule 3
        return ip

    return result


def canonical_path(path: str) -> str:
    """Return a canonical path."""
    # 1: Resolve the sequences "/../" and "/./" in the path by replacing "/./" with "/",
    #    and removing "/../" along with the preceding path component.
    # 2: Replace runs of consecutive slashes with a single slash character.

    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py#157
    result = path
    if not result:
        return "/"

    if result[0] != "/":
        result = f"/{result}"

    result = escape(result)
    parts: List[str] = []
    for part in result.split("/"):
        if part == "..":  # remove previous part (if any)
            if len(parts) > 0:
                parts.pop()
        elif part and part != ".":  # skip empty and .
            parts.append(part)

    result = f"/{'/'.join(parts)}"
    if path.endswith("/") and not result.endswith("/"):
        result = f"{result}/"
    # leading and trailing slashes added (if needed)

    # SPECIAL CASE: Handle URLs tacked on.
    result = result.replace("http:/", "http://").replace("https:/", "https://")
    return result


def canonical_url(url: Union[str, bytes]) -> str:
    """Return a canonical version of `url`."""
    # See: https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py
    # 4: Remove tab (0x09), CR (0x0d), and LF (0x0a) characters from the URL.
    #    Do not remove escape sequences for these characters (e.g. '%0a').

    # 1: URL is valid RFC 2396
    # 2: Convert internationalized domain name (IDN) to ASCII Punycode.
    # 3: URL must include a path.
    # 5: Remove the fragment.
    # 6: Repeatedly percent-unescape the URL until it has no more percent-escapes.
    url = url.strip()  # no leading or trailing whitespace

    has_end_q = False
    if isinstance(url, bytes):
        url = url.replace(b"\t", b"").replace(b"\r", b"").replace(b"\n", b"")  # Rule 4
        url = url.replace(b"#!", b"?_escaped_fragment_=")  # Different than Google
        if (pos := url.find(b"#")) >= 0:
            url = url[0:pos]  # Rule 5
        has_end_q = url.endswith(b"?")
    else:
        url = url.replace("\t", "").replace("\r", "").replace("\n", "")  # Rule 4
        url = url.replace("#!", "?_escaped_fragment_=")  # Different than Google
        if (pos := url.find("#")) >= 0:
            url = url[0:pos]  # Rule 5
        has_end_q = url.endswith("?")

    url = escape(url)  # in case, e.g., scheme has encoded characters
    if url.startswith(":") and len(url) > 1:
        url = f"localhost{url}"
    url = provide_url_scheme(url, DEFAULT_SCHEME)

    parts = AttrDict(deconstruct_url(url)._asdict())
    if not parts.host:
        return ""

    parts.scheme = parts.scheme.lower() if parts.scheme else DEFAULT_SCHEME
    parts.userinfo = normalize_userinfo(parts.userinfo)
    parts.host = canonical_host(parts.host)
    parts.port = normalize_port(parts.port, parts.scheme)
    parts.path = canonical_path(cast(str, parts.path))
    parts.query = normalize_query(parts.query)
    parts.fragment = ""

    result = str(reconstruct_url(parts))
    if has_end_q and not result.endswith("?"):
        result += "?"

    return result

Global variables

var DEFAULT_SCHEME: Default scheme that browsers uses.
var SAFE_CHARS: Characters that are not escaped.

Functions

def unquote2(string: Union[str, bytes]) ‑> Union[str, bytes]

Unquote both strings and bytes.

Expand source code

def unquote2(string: Union[str, bytes]) -> Union[str, bytes]:
    """Unquote both strings and bytes."""
    if isinstance(string, bytes):
        return unquote_to_bytes(string)
    return unquote(string)

def escape(string: Union[str, bytes]) ‑> str

Fully escape string, then re-escape once.

Expand source code

def escape(string: Union[str, bytes]) -> str:
    """Fully escape `string`, then re-escape once."""
    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py#292
    unquoted = unquote2(string)
    while unquoted != string:
        string = unquoted
        unquoted = unquote2(unquoted)
    return quote(unquoted, SAFE_CHARS)

def canonical_ip(host: str) ‑> str

Return a canonical IP address.

Expand source code

def canonical_ip(host: str) -> str:
    """Return a canonical IP address."""
    if len(host) <= 15:
        # This handles the Windows resolver allows an IP
        # followed by a space and something else as long as it
        # is under 15 characters.
        if m := IP_WITH_TRAILING_SPACE.match(host):
            host = m.group(1)

    if not POSSIBLE_IP.match(host):
        return ""

    # Try to parse octal, if possible.
    allow_octal = not FIND_BAD_OCTAL_REGEXP.search(host)

    # Skip trailing, leading and consecutive dots.
    parts = [part for part in host.split(".") if part]
    if len(parts) > 4:
        return ""

    ip: List[str] = []
    for i, part in enumerate(parts):
        if m := HEX.match(part):
            base = 16
        elif allow_octal and (m := OCT.match(part)):
            base = 8
        elif m := DEC.match(part):
            base = 10
        else:
            return ""

        # print("part:", part, "m:", m.group(1), "base:", base)
        n = int(m.group(1), base)
        if n <= 255:
            ip.append(str(n))
            continue

        # print("n > 255:", n)
        if i < len(parts) - 1:
            n &= 0xFF
            ip.append(str(n))
        else:
            bar = bytearray()
            while n > 0 and len(bar) < 4:
                bar.append(n & 0xFF)
                n >>= 8

            if len(ip) + len(bar) > 4:
                return ""

            bar.reverse()
            ip.extend(str(b) for b in bar)

    return ".".join((ip + (["0"] * 4))[:4])

def canonical_host(host: str) ‑> str

Return a canonical hostname.

Expand source code

def canonical_host(host: str) -> str:
    """Return a canonical hostname."""
    # 0: IDN host names should be converted to ASCII punycode
    # 1: Remove all leading and trailing dots.
    # 2: Replace consecutive dots with a single dot.
    # 3: If the hostname can be parsed as an IP address, normalize it to
    #    4 dot-separated decimal values. The client should handle any legal IP-address
    #    encoding, including octal, hex, and fewer than four components.
    # 4: Lowercase the whole string.

    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py#207
    if not host:
        return ""

    result = host.lower()  # Rule 4
    result = ".".join([part for part in result.split(".") if part])  # Rule 1 & 2
    result = result.encode("idna").decode("utf-8")  # Rule 0

    if ip := canonical_ip(host):  # Rule 3
        return ip

    return result

def canonical_path(path: str) ‑> str

Return a canonical path.

Expand source code

def canonical_path(path: str) -> str:
    """Return a canonical path."""
    # 1: Resolve the sequences "/../" and "/./" in the path by replacing "/./" with "/",
    #    and removing "/../" along with the preceding path component.
    # 2: Replace runs of consecutive slashes with a single slash character.

    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py#157
    result = path
    if not result:
        return "/"

    if result[0] != "/":
        result = f"/{result}"

    result = escape(result)
    parts: List[str] = []
    for part in result.split("/"):
        if part == "..":  # remove previous part (if any)
            if len(parts) > 0:
                parts.pop()
        elif part and part != ".":  # skip empty and .
            parts.append(part)

    result = f"/{'/'.join(parts)}"
    if path.endswith("/") and not result.endswith("/"):
        result = f"{result}/"
    # leading and trailing slashes added (if needed)

    # SPECIAL CASE: Handle URLs tacked on.
    result = result.replace("http:/", "http://").replace("https:/", "https://")
    return result

def canonical_url(url: Union[str, bytes]) ‑> str

Return a canonical version of url.

Expand source code

def canonical_url(url: Union[str, bytes]) -> str:
    """Return a canonical version of `url`."""
    # See: https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
    # See: https://chromium.googlesource.com/external/google-safe-browsing/+/06a8c4e799233da220ad7411e2bfacc74cbfbb37/python/expression.py
    # 4: Remove tab (0x09), CR (0x0d), and LF (0x0a) characters from the URL.
    #    Do not remove escape sequences for these characters (e.g. '%0a').

    # 1: URL is valid RFC 2396
    # 2: Convert internationalized domain name (IDN) to ASCII Punycode.
    # 3: URL must include a path.
    # 5: Remove the fragment.
    # 6: Repeatedly percent-unescape the URL until it has no more percent-escapes.
    url = url.strip()  # no leading or trailing whitespace

    has_end_q = False
    if isinstance(url, bytes):
        url = url.replace(b"\t", b"").replace(b"\r", b"").replace(b"\n", b"")  # Rule 4
        url = url.replace(b"#!", b"?_escaped_fragment_=")  # Different than Google
        if (pos := url.find(b"#")) >= 0:
            url = url[0:pos]  # Rule 5
        has_end_q = url.endswith(b"?")
    else:
        url = url.replace("\t", "").replace("\r", "").replace("\n", "")  # Rule 4
        url = url.replace("#!", "?_escaped_fragment_=")  # Different than Google
        if (pos := url.find("#")) >= 0:
            url = url[0:pos]  # Rule 5
        has_end_q = url.endswith("?")

    url = escape(url)  # in case, e.g., scheme has encoded characters
    if url.startswith(":") and len(url) > 1:
        url = f"localhost{url}"
    url = provide_url_scheme(url, DEFAULT_SCHEME)

    parts = AttrDict(deconstruct_url(url)._asdict())
    if not parts.host:
        return ""

    parts.scheme = parts.scheme.lower() if parts.scheme else DEFAULT_SCHEME
    parts.userinfo = normalize_userinfo(parts.userinfo)
    parts.host = canonical_host(parts.host)
    parts.port = normalize_port(parts.port, parts.scheme)
    parts.path = canonical_path(cast(str, parts.path))
    parts.query = normalize_query(parts.query)
    parts.fragment = ""

    result = str(reconstruct_url(parts))
    if has_end_q and not result.endswith("?"):
        result += "?"

    return result