# pdf-segmented/pdf_segmented/output/pdf.py

#   pdf-segmented: Generate PDFs using separate compression for foreground and background
#   Copyright (C) 2025  Lee Yingtong Li
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

from ..compression import CompressedLayer, CompressedPage
from ..compression.jbig2 import JBIG2Layer
from ..compression.jpeg import JPEGLayer
from ..input import InputPages

from pikepdf import ContentStreamInstruction, Name, Operator, Page, Pdf, Stream, unparse_content_stream

from typing import Generator

def pdf_write_pages(
	input_pages: InputPages,
	# All 3 type arguments are spelled out: typing.Generator only has defaults
	# for the send/return types from Python 3.13 onwards
	compressed_pages: Generator[CompressedPage, None, None],
	output_file: str
) -> None:
	"""
	Write the compressed pages to a new PDF file

	One PDF page is added per item yielded by compressed_pages. Every page has
	the same size, computed from the pixel dimensions and DPI of input_pages.
	The bg layer of each page is rendered first, then the fg layer over it.

	:param input_pages: Provides the width, height (in pixels) and dpi used for all pages
	:param compressed_pages: Compressed pages, each exposing a bg and an fg layer
	:param output_file: Destination passed to Pdf.save
	:raises NotImplementedError: Propagated from pdf_write_layer for an unsupported layer type
	"""

	# Get size of image in PostScript points (72 points per inch)
	width_pt = input_pages.width / input_pages.dpi * 72
	height_pt = input_pages.height / input_pages.dpi * 72

	# Build PDF
	# The context manager closes the Pdf even if writing a layer or saving fails
	with Pdf.new() as pdf:
		# Write each page
		for compressed_page in compressed_pages:
			page = pdf.add_blank_page(page_size=(width_pt, height_pt))

			# Write each layer to the page (background first, so the foreground is painted over it)
			content_instructions = []
			pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, content_instructions=content_instructions)
			pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, content_instructions=content_instructions)

			# Generate content stream
			# Image XObjects are painted into the unit square, so the layers are wrapped in a
			# saved graphics state whose transformation matrix scales that square to the full page
			wrapped_instructions = [
				ContentStreamInstruction([], Operator('q')),
				ContentStreamInstruction([width_pt, 0, 0, height_pt, 0, 0], Operator('cm'))
			] + content_instructions + [
				ContentStreamInstruction([], Operator('Q')),
			]
			content_stream = unparse_content_stream(wrapped_instructions)
			page.Contents.write(content_stream)

		# Save PDF
		pdf.save(output_file)

def pdf_write_layer(
	input_pages: InputPages,
	pdf: Pdf,
	page: Page,
	layer: CompressedLayer,
	content_instructions,
) -> None:
	"""
	Write one compressed layer to the page as an image

	The type of the layer selects the image dictionary entries; the actual
	writing is delegated to pdf_write_image.

	:param input_pages: Forwarded to pdf_write_image
	:param pdf: PDF that will own the image
	:param page: Page the image is added to
	:param layer: JBIG2Layer or JPEGLayer whose data is embedded
	:param content_instructions: List of content stream instructions, forwarded to pdf_write_image
	:raises NotImplementedError: If the layer is neither a JBIG2Layer nor a JPEGLayer
	"""

	# Choose the image dictionary entries matching the compression of the layer
	if isinstance(layer, JBIG2Layer):
		# 1-bit greyscale image
		image_entries = dict(
			ColorSpace=Name.DeviceGray,
			Filter=Name.JBIG2Decode,
			BitsPerComponent=1,
			Mask=[1, 1],  # Layer mask
		)
	elif isinstance(layer, JPEGLayer):
		# 8-bit RGB image
		image_entries = dict(
			ColorSpace=Name.DeviceRGB,
			Filter=Name.DCTDecode,
			BitsPerComponent=8,
		)
	else:
		raise NotImplementedError()

	# Write the layer to PDF
	pdf_write_image(
		input_pages=input_pages,
		pdf=pdf,
		page=page,
		value=layer.data,
		content_instructions=content_instructions,
		**image_entries
	)

def pdf_write_image(
	input_pages: InputPages,
	pdf: Pdf,
	page: Page,
	value: bytes,
	content_instructions,
	**kwargs
) -> None:
	"""
	Embed already-encoded image data in the page and queue its rendering

	:param input_pages: Provides the width and height recorded as the image's Width and Height
	:param pdf: PDF that owns the newly created stream
	:param page: Page whose resources receive the image
	:param value: Data of the image stream
	:param content_instructions: List to which the instruction rendering the image is appended
	:param kwargs: Additional entries for the image stream dictionary (e.g. ColorSpace, Filter)
	"""

	# Entries common to every image, regardless of its compression
	common_entries = dict(
		Type=Name.XObject,
		Subtype=Name.Image,
		Width=input_pages.width,
		Height=input_pages.height,
	)

	# Create the Image stream and register it as an XObject resource of the page
	image_stream = Stream(pdf, value, **common_entries, **kwargs)
	resource_name = page.add_resource(image_stream, '/XObject')

	# Queue the operator which renders the image, referring to it by its resource name
	render_instruction = ContentStreamInstruction([resource_name], Operator('Do'))
	content_instructions.append(render_instruction)