Package pdfmerge
Command-line utility for merging, splicing, and rotating PDF documents.
Why?
I find myself merging bits of different PDFs fairly regularly and really wanted a simple CLI way to do it.
Install
python -m pip install pdfmerge
Usage
$ pdfmerge [-h] [--version]
[-o FILE|--output FILE]
[-p PASSWORD|--password PASSWORD]
PATH[RULE[, RULE ...]] [[PATH[RULE, ...]] ...]
-o, --output
output file (default: output.pdf).
-p, --password
password for encrypted files (default: empty string).
PATH
a file, directory, or wildcard string (e.g., file*.pdf) of files to merge.
RULE
an optional string indicating which pages to extract and rotate. The syntax for each rule is: [START][..][END][ROTATE]
Where START and END are positive (1-based) or negative page numbers and ROTATE is one of >, V, or < indicating a clockwise rotation of 90, 180, or 270 degrees, respectively.
Command-line Example
$ pdfmerge -o out.pdf file1.pdf file2.pdf[3,3] file2.pdf[1V,2..-1] "other*.pdf[<]" "/path/pdf[1..4>,5]"
This example illustrates several features:
- specifying an output file
- merging multiple files, some more than once
- splicing parts of a file using indices (1-based; negatives allowed)
- including the same page multiple times
- rotating a page or page range
- merging all the PDFs in a directory
Python Module Usage
pdfmerge() can also be imported into Python scripts.
from pdfmerge import pdfmerge
pdfmerge(["pdf-1.pdf", "pdf-2.pdf[2>]"], "output.pdf")
License
Expand source code
"""Command-line utility for merging, splicing, and rotating PDF documents.
.. include:: ../../README.md
:start-line: 2
"""
# native
from __future__ import annotations
from dataclasses import dataclass
from dataclasses import field
from getpass import getpass
from glob import glob
from pathlib import Path
from typing import Any
from typing import Generic
from typing import List
from typing import Optional
from typing import Sequence
from typing import TypeVar
from typing import Union
import re
import sys
import tempfile
import shutil
# lib
from pypdf import PdfReader
from pypdf import PdfWriter
# Names exported by `from pdfmerge import *`.
__all__ = ("__version__", "pdfmerge")
__version__ = "1.0.1"

# Error message templates; the `{0}` / `{1}` placeholders are filled in with
# `str.format` before the message is reported.
ERR_PATH = "ERROR: path not found: {0}"
"""Error when a path is not found."""
ERR_RULE = "ERROR: invalid rule: {0}"
"""Error when an invalid rule is encountered."""
# NOTE(review): ERR_RANGE and ERR_BOUNDS are not referenced by the code
# visible in this module -- confirm whether they are still needed.
ERR_RANGE = "ERROR: page {0} out of range [1-{1}]"
"""Error when a page is outside of the available ranges."""
ERR_BOUNDS = "ERROR: missing upper bound on range [{0}..]"
"""Error when the upper range is missing."""
# RE_MATCH_TYPE = type(re.match("", ""))

# Splits `PATH[RULES]` into group 1 (the path) and group 2 (the text between
# the trailing square brackets).
RE_HAS_RULE = re.compile(r"^(.*)\[(.*)\]$")
"""Regex that matches when an input has a rule."""
# Groups, all optional: (1) start page, (2) the ".." range marker,
# (3) end page, (4) rotation character. Page numbers may carry a leading "-".
RE_RULE = re.compile(r"^(-?\d+)?(\.\.)?(-?\d+)?([>V<])?$")
"""Regex that checks that the rule is valid."""
RULE_RANGE = ".."
"""Range indicator."""
# Maps RE_RULE's rotation group to clockwise degrees; the `None` key covers
# rules that carry no rotation character.
RULE_ROTATE = {None: 0, ">": 90, "V": 180, "<": 270}
"""Rotation rules."""
# A bare ".." has neither bounds nor a rotation character.
RULE_DEFAULT = RULE_RANGE
"""Default rule is all pages, unrotated."""
T = TypeVar("T")
"""Generic type."""


@dataclass
class Parsed(Generic[T]):
    """Accumulator that pairs successfully parsed items with error messages."""

    done: List[T] = field(default_factory=list)
    """Items that parsed cleanly."""

    errors: List[str] = field(default_factory=list)
    """Messages describing whatever failed to parse."""

    def __add__(self, other: Parsed[Any]) -> Parsed[Any]:
        """Return a new result holding this result's data followed by `other`'s."""
        combined_done = self.done + other.done
        combined_errors = self.errors + other.errors
        return Parsed(done=combined_done, errors=combined_errors)

    def __iadd__(self, other: Parsed[Any]) -> Parsed[Any]:
        """Append `other`'s items and errors to this result in place."""
        self.done.extend(other.done)
        self.errors.extend(other.errors)
        return self
@dataclass
class ParsedPath:
    """A single input file together with the page rules to apply to it."""

    path: Path
    """Location of the file to read."""

    rules: List[str] = field(default_factory=list)
    """Rule strings for this file; a fresh empty list when none are given."""
def _resolve_page(text: str, last: int) -> int:
    """Turn one page token into a valid 1-based page number in ``[1, last]``."""
    page = int(text)
    if page < 1:
        # Zero and negative numbers count back from one past the last page,
        # so -1 is the last page and 0 lands just beyond it.
        page += last + 1
    # Clamp on both sides so the caller never receives an unusable page number.
    return max(1, min(page, last))


def rangify(
    rule: Union[str, re.Match[str]],
    last: int,
) -> Union[List[int], range]:
    """Convert a rule into the 1-based page numbers it selects.

    Page numbers that fall outside the document are clamped to the nearest
    valid page rather than rejected. A rule of `0` resolves to one past the
    end and is therefore clamped to the last page.

    Args:
        rule (str, Match): pages to extract or a regex match of the rule
        last (int): number of pages in the document

    Returns:
        Union[List[int], range]: list or `range` of pages to extract. Empty
        when the document has no pages.

    Raises:
        AssertionError: if `rule` is a string that is not a valid rule.
        TypeError: if `rule` is neither a string nor a regex match.

    Examples:
        >>> list(rangify('', 3))
        [1, 2, 3]
        >>> list(rangify('1', 10))
        [1]
        >>> list(rangify('1..3', 10))
        [1, 2, 3]
        >>> list(rangify('3..1', 10))
        [3, 2, 1]
        >>> rangify('1..', 5) == rangify('..', 5)
        True
        >>> list(rangify('-3..-1', 5))
        [3, 4, 5]
        >>> list(rangify(RE_RULE.search('5..7'), 3))
        [3]
        >>> list(rangify('-9..2', 3))
        [1, 2]
        >>> list(rangify('..', 0))
        []
    """
    match: Optional[re.Match[str]]
    if isinstance(rule, str):
        match = RE_RULE.search(rule)
        assert match, ERR_RULE.format(rule)
    elif isinstance(rule, re.Match):
        match = rule
    else:
        raise TypeError(
            f"rule must be a str or re.Match, not {type(rule).__name__}"
        )
    if not match:  # pragma: no cover
        return []
    if last < 1:
        # An empty document has nothing to select; without this guard the
        # default bounds (1..0) would produce a reversed range of bogus pages.
        return []

    left, has_range, right, _ = match.groups()
    # A rule with no page numbers at all ("" or just a rotation) means "..".
    is_range = has_range == RULE_RANGE or not (left or right)
    beg = _resolve_page(left, last) if left else 1
    end = _resolve_page(right, last) if right else last

    if not is_range:
        return [beg]
    if end < beg:
        # Descending range: pages come out in reverse order.
        return range(beg, end - 1, -1)
    return range(beg, end + 1)
def add_path(
    writer: PdfWriter,
    item: ParsedPath,
    password: Optional[str] = None,
) -> PdfWriter:
    """Add the pages selected by `item.rules` from `item.path` to a PDF writer.

    Rules are applied in order, so the same page may be added several times,
    each time with its own rotation.

    Args:
        writer (PdfWriter): writer to add to.
        item (ParsedPath): the path and rules to add.
        password (str, optional): password for encrypted files. When `None`
            and the file is encrypted, the user is prompted for one.
            Defaults to `None`.

    Returns:
        PdfWriter: the same writer object that was passed in.

    Raises:
        AssertionError: if one of `item.rules` is not a valid rule.
    """
    # Hand pypdf the path rather than an open handle so that pypdf opens and
    # closes the file itself; the previous `item.path.open("rb")` handle was
    # never closed, leaking one descriptor per merged file.
    reader = PdfReader(item.path)
    if reader.is_encrypted:
        if password is None:
            print(f"Reading encrypted PDF <{item.path}>")
            password = getpass()
        reader.decrypt(password)

    page_count = len(reader.pages)
    for rule in item.rules:
        match = RE_RULE.search(rule)
        assert match, ERR_RULE.format(rule)
        angle = RULE_ROTATE[match.group(4)]
        for num in rangify(match, page_count):
            # Rotate the copy that `add_page` returns, not the reader's page:
            # `rotate` works in place and is cumulative, so rotating the
            # source would leak into every later use of the same page
            # (e.g. "[1>,1]" would rotate both copies).
            writer.add_page(reader.pages[num - 1]).rotate(angle)
    return writer
def parse_paths(
    inputs: Sequence[str],
    _rule: Union[str, List[str]] = RULE_DEFAULT,
) -> Parsed[ParsedPath]:
    """Split inputs into `Path` and rules.

    Each input is `PATH` or `PATH[RULE, ...]`. A directory expands to the
    sorted `*.pdf` files directly inside it, and an input containing `*`
    expands to its sorted glob matches. Expanded files inherit the rules of
    the input that produced them. Every input is processed, including those
    that follow a directory or wildcard.

    Args:
        inputs (Sequence[str]): inputs to parse
        _rule (str, optional): default rule(s) to apply to inputs that carry
            none of their own. Defaults to `RULE_DEFAULT`.

    Returns:
        Parsed:
            - `.done`: contains a list of successfully parsed items
                - `.path`: `Path` to the file
                - `.rules`: list of rules to apply
            - `.errors`: list of distinct errors encountered
    """
    result: Parsed[ParsedPath] = Parsed()
    default_rules = [_rule] if isinstance(_rule, str) else _rule

    def report(message: str) -> None:
        """Record an error once, however many inputs trigger it."""
        if message not in result.errors:
            result.errors.append(message)

    for item in inputs:
        ok = True
        rules = default_rules
        has_rule = RE_HAS_RULE.search(item)
        if has_rule:
            item = has_rule.group(1)
            rules = re.sub(r"\s", "", has_rule.group(2)).split(",")
        for rule in rules:
            if not RE_RULE.search(rule):
                ok = False
                report(ERR_RULE.format(rule))
        # rules checked

        path = Path(item)
        expanded: Optional[List[str]] = None
        if path.is_dir():
            expanded = sorted(str(p) for p in path.glob("*.pdf"))
        elif "*" in item:
            expanded = sorted(glob(item))
        if expanded is not None:
            # Merge the expansion and keep going. Returning here, as the
            # code used to, silently dropped every input after the first
            # directory or wildcard.
            nested = parse_paths(expanded, rules)
            result.done += nested.done
            for err in nested.errors:
                report(err)
            continue
        # folder / glob handled

        if not path.exists():
            ok = False
            report(ERR_PATH.format(path))
        # path checked

        if ok:
            result.done.append(ParsedPath(path=path, rules=rules))
    return result
def pdfmerge(inputs: List[str], output: str, password: Optional[str] = None) -> None:
    """Merge PDFs into a single PDF.

    If any input fails to parse, the errors are printed to stderr and no
    output is written. Otherwise the merged document is written to a
    temporary file, which is then moved to `output`.

    Args:
        inputs (List[str]): list of paths to merge with optional rules
        output (str): output file name
        password (str, optional): password for encrypted files. Defaults to `None`.
    """
    parsed = parse_paths(inputs)
    if parsed.errors:
        for err in parsed.errors:
            print(err, file=sys.stderr)
        return

    target = Path(output).expanduser().resolve()
    moved = False
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        with temp:
            writer = PdfWriter(temp)
            for item in parsed.done:
                add_path(writer, item, password)
            writer.write_stream(temp)
            temp.flush()
        # Move only after the handle is closed: moving a file that is still
        # open fails on Windows.
        shutil.move(temp.name, target)
        moved = True
    finally:
        if not moved:
            # `delete=False` means nobody else removes the temp file, so a
            # failed merge must not leave it behind.
            try:
                Path(temp.name).unlink()
            except FileNotFoundError:
                pass  # already gone, e.g. a move that failed half-way
# pdoc configuration: presumably the `True` entry forces the otherwise
# private-looking `__main__` sub-module into the generated docs -- confirm
# against the pdoc version in use.
__pdoc__ = {"pdfmerge.__main__": True}
Sub-modules
pdfmerge.__main__
-
pdfmerge - Merge, splice, and rotate PDFs …
Functions
def pdfmerge(inputs: List[str], output: str, password: Optional[str] = None) ‑> None
-
Merge PDFs into a single PDF.
Args
inputs : List[str]
- list of paths to merge with optional rules
output : str
- output file name
password : str, optional
- password for encrypted files. Defaults to None.
Expand source code
def pdfmerge(inputs: List[str], output: str, password: Optional[str] = None) -> None:
    """Merge PDFs into a single PDF.

    If any input fails to parse, the errors are printed to stderr and no
    output is written. Otherwise the merged document is written to a
    temporary file, which is then moved to `output`.

    Args:
        inputs (List[str]): list of paths to merge with optional rules
        output (str): output file name
        password (str, optional): password for encrypted files. Defaults to `None`.
    """
    parsed = parse_paths(inputs)
    if parsed.errors:
        for err in parsed.errors:
            print(err, file=sys.stderr)
        return

    target = Path(output).expanduser().resolve()
    moved = False
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        with temp:
            writer = PdfWriter(temp)
            for item in parsed.done:
                add_path(writer, item, password)
            writer.write_stream(temp)
            temp.flush()
        # Move only after the handle is closed: moving a file that is still
        # open fails on Windows.
        shutil.move(temp.name, target)
        moved = True
    finally:
        if not moved:
            # `delete=False` means nobody else removes the temp file, so a
            # failed merge must not leave it behind.
            try:
                Path(temp.name).unlink()
            except FileNotFoundError:
                pass  # already gone, e.g. a move that failed half-way