Module freshlinks.scraper

Ping and scrape links.

Expand source code
"""Ping and scrape links."""

# std
from time import time
from typing import Iterator
from typing import Literal
from typing import Optional
from typing import Tuple
from urllib.parse import urljoin
import re

# lib
import requests
from requests import Response
from bs4 import BeautifulSoup

# pkg
from .cache import LinkEntry
from .canonicalize import canonical_url

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    )
}
"""HTTP headers to send when pinging."""

CSS_IMPORT = re.compile(r"""@import ["']([^"']+)""", re.MULTILINE)
"""Match a CSS `@import` expression."""

CSS_URL = re.compile(r"url\s*\(\s*([^)]+)\s*\)", re.MULTILINE)
"""Match a CSS `url()` expression."""

LINK_ATTRS = ["href", "src"]
"""HTML attributes that contain a URL."""

LINK_SELECT = ",".join(f"[{a}]" for a in LINK_ATTRS)
"""Query for selecting HTML tags that have URLs."""

LINK_IGNORE = re.compile(r"^(#|data:|mailto:|tel:)")
"""Pattern of links to ignore."""


def ping_url(
    url: str, method: Literal["head", "get"] = "get"
) -> Tuple[Optional[Response], LinkEntry]:
    """Make a request and return the response (if any) and `LinkEntry`."""
    t = int(time())
    try:
        res = getattr(requests, method)(url, timeout=3, headers=HEADERS)
        return res, LinkEntry(time=t, code=res.status_code, url=url)
    except Exception as e:
        return None, LinkEntry(time=t, code=-1, err=str(e), url=url)


def css_links(text: str) -> Iterator[str]:
    """Yield the links in a CSS document."""
    for pattern in [CSS_IMPORT, CSS_URL]:
        for link in pattern.findall(text):
            yield link


def html_links(soup: BeautifulSoup) -> Iterator[str]:
    """Yield the links in an HTML document."""
    for tag in soup.select(LINK_SELECT):
        if tag.name == "link" and "preconnect" in tag.get_attribute_list("rel", []):
            continue  # skip non-URL connection hints

        for attr in LINK_ATTRS:
            if attr in tag.attrs:
                yield tag.get_attribute_list(attr, [""])[0]


def scrape_links(res: Response) -> Iterator[str]:
    """Yield links in a `Response` based on content-type."""
    base = res.url
    ctype = res.headers.get("Content-Type", "")  # a missing header counts as non-text
    links: Iterator[str] = iter([])

    if not ctype.startswith("text/"):  # skip non-textual (images, zip files)
        pass
    elif ctype.startswith("text/css") or base.endswith(".css"):
        links = css_links(res.text)
    elif ctype.startswith("text/xml") or base.endswith(".xml"):
        links = html_links(BeautifulSoup(res.content, features="xml"))
    else:  # assume HTML
        soup = BeautifulSoup(res.content, features="html.parser")
        links = html_links(soup)
        for tag in soup.find_all("base"):
            if "href" in tag.attrs:
                base = urljoin(base, tag["href"])  # a relative <base> resolves against the page URL
                break  # only the first href counts

    for link in links:
        if not LINK_IGNORE.match(link):
            yield canonical_url(urljoin(base, link))

Global variables

var HEADERS

HTTP headers to send when pinging.
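
These headers imitate a desktop Chrome browser so that servers which reject unknown clients still respond. A minimal sketch of sending them by hand (the URL is illustrative):

import requests

from freshlinks.scraper import HEADERS

res = requests.get("https://example.com", headers=HEADERS, timeout=3)
print(res.status_code)  # e.g. 200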

var CSS_IMPORT

Match a CSS @import expression.

var CSS_URL

Match a CSS url() expression.

var LINK_ATTRS

HTML attributes that contain a URL.

var LINK_SELECT

Query for selecting HTML tags that have URLs.

var LINK_IGNORE

Pattern of links to ignore.
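
For reference, a quick sketch of what the derived constants evaluate to, with the values re-declared from the source above so the snippet runs standalone:

import re

LINK_ATTRS = ["href", "src"]
LINK_SELECT = ",".join(f"[{a}]" for a in LINK_ATTRS)
LINK_IGNORE = re.compile(r"^(#|data:|mailto:|tel:)")

assert LINK_SELECT == "[href],[src]"          # CSS attribute selectors, comma = OR
assert LINK_IGNORE.match("mailto:a@b.org")    # ignored: not a fetchable link
assert not LINK_IGNORE.match("/docs/index")   # kept: relative URL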

Functions

def ping_url(url: str, method: Literal['head', 'get'] = 'get') ‑> Tuple[Optional[requests.models.Response], LinkEntry]

Make a request and return the response (if any) and LinkEntry.
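
A minimal usage sketch (the URL is illustrative; on failure the response is None and the LinkEntry records code=-1 plus the error text):

from freshlinks.scraper import ping_url

res, entry = ping_url("https://example.com", method="head")
if res is None:
    print("failed:", entry.err)   # err holds the exception message
else:
    print(entry.code, entry.url)  # e.g. 200 https://example.com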

def css_links(text: str) ‑> Iterator[str]

Yield the links in a CSS document.
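
For example, on a small inline stylesheet (illustrative):

from freshlinks.scraper import css_links

css = '@import "theme.css";\nbody { background: url(img/bg.png); }'
print(list(css_links(css)))  # ['theme.css', 'img/bg.png']

Note that links come back exactly as written; they are resolved against the page URL later, in scrape_links.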

def html_links(soup: bs4.BeautifulSoup) ‑> Iterator[str]

Yield the links in an HTML document.
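
For example (note that the preconnect hint is skipped):

from bs4 import BeautifulSoup
from freshlinks.scraper import html_links

html = """
<link rel="preconnect" href="https://fonts.example">
<a href="/about">About</a>
<img src="logo.png">
"""
soup = BeautifulSoup(html, features="html.parser")
print(list(html_links(soup)))  # ['/about', 'logo.png']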

def scrape_links(res: requests.models.Response) ‑> Iterator[str]

Yield links in a Response based on content-type.
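
Putting the pieces together, a sketch of one ping-and-scrape step (the URL is illustrative):

from freshlinks.scraper import ping_url, scrape_links

res, entry = ping_url("https://example.com")
if res is not None and 200 <= entry.code < 300:
    for link in scrape_links(res):
        print(link)  # absolute, canonicalized URLs found in the response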
