gimp-file-jbig2pdf/jbig2topdf_lossless.py

#!/usr/bin/env python3

#   GIMP plug-in for JBIG2-encoded PDF files
#   Copyright (C) 2024  Lee Yingtong Li (RunasSudo)
#
#   Adapted from jbig2enc by Adam Langley <agl@imperialviolet.org> - Copyright (C) 2006 Google Inc., licensed under Apache v2
#   Loosely adapted from file-openraster.py - Copyright (C) 2009 by Jon Nordby <jononor@gmail.com>, licensed under the GPLv3
#   In turn based on MyPaint source code by Martin Renold
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import glob
import struct
import sys
from pathlib import Path

# Fallback resolution, treated as dots per inch: create_pdf substitutes this
# value when a page file's header reports an x or y resolution of 0.
dpi = 72  # Default DPI value

class Ref:
	"""Indirect reference to the PDF object numbered x (generation 0)."""
	
	def __init__(self, x: int):
		# Object number of the referenced object.
		self.x = x
	
	def __str__(self) -> str:
		"""Render the reference in PDF syntax, e.g. "3 0 R"."""
		return "{} 0 R".format(self.x)

class Dict:
	"""PDF dictionary holding name -> value pairs in insertion order."""
	
	def __init__(self, values: dict = None):
		"""Store a shallow copy of values (an empty dictionary when omitted)."""
		source = {} if values is None else values
		# Copy so later edits to self.d never reach the mapping passed in.
		self.d = source.copy()
	
	def __str__(self) -> str:
		"""Render as "<< /Key value ... >>" followed by a newline.
		
		Keys get a leading "/"; values are inserted as their string form.
		"""
		body = " ".join("/{} {}".format(name, item) for name, item in self.d.items())
		return "<< " + body + " >>\n"

class Obj:
	"""A numbered PDF object: a dictionary plus an optional stream.
	
	Object ids are handed out sequentially, in construction order, from the
	class-wide next_id counter.
	"""
	
	# Id the next constructed Obj will receive; shared by all instances.
	next_id = 1
	
	def __init__(self, d: dict = None, stream: str = None):
		"""Create the object and claim the next free object id.
		
		d: dictionary entries (keys without the leading "/", values already
			rendered as PDF text). The mapping passed in is not modified.
		stream: optional stream payload. When given, a "Length" entry holding
			len(stream) is added to the dictionary. That is a character
			count, so it equals the byte length only if the document is
			later encoded with one byte per character.
		"""
		# Work on a private copy so that adding "Length" never leaks into the
		# caller's dictionary.
		entries = {} if d is None else dict(d)
		if stream is not None:
			entries["Length"] = str(len(stream))
		self.d = Dict(entries)
		self.stream = stream
		self.id = Obj.next_id
		Obj.next_id += 1
	
	def __str__(self) -> str:
		"""Render the dictionary, the stream (if any) and the "endobj" line.
		
		The leading "<id> 0 obj" line is not included here.
		"""
		parts = [str(self.d)]
		if self.stream is not None:
			parts.append(f"stream\n{self.stream}\nendstream\n")
		parts.append("endobj\n")
		return "".join(parts)

class Doc:
	"""Ordered collection of PDF objects that serializes to a whole PDF file."""
	
	def __init__(self):
		# objs: every object, in output order.
		# pages: the objects that were registered through add_page.
		self.objs, self.pages = [], []
	
	def add_object(self, obj: Obj) -> Obj:
		"""Append obj to the output list and hand it back to the caller."""
		self.objs += [obj]
		return obj
	
	def add_page(self, page: Obj) -> Obj:
		"""Record page in the page list, then register it like any other object."""
		self.pages += [page]
		return self.add_object(page)
	
	def __str__(self) -> str:
		"""Serialize header, objects, cross-reference table and trailer.
		
		The pieces are joined with newlines. Offsets are counted in
		characters, so they are byte offsets only when the result is encoded
		with one byte per character. The cross-reference table lists the
		objects in insertion order right after the free entry 0, and the
		trailer always names object 1 as the root.
		"""
		lines = ["%PDF-1.4"]
		# Running position of the next piece; the +1 is the joining newline.
		position = len(lines[0]) + 1
		
		object_offsets = []
		for entry in self.objs:
			object_offsets.append(position)
			for text in (f"{entry.id} 0 obj", str(entry)):
				lines.append(text)
				position += len(text) + 1
		
		# After the last object, position is where the xref keyword starts.
		entry_count = len(object_offsets) + 1
		lines += ["xref", f"0 {entry_count}", "0000000000 65535 f "]
		lines += [f"{start:010} 00000 n " for start in object_offsets]
		lines += [
			"trailer",
			f"<< /Size {entry_count}\n/Root 1 0 R >>",
			"startxref",
			str(position),
			"%%EOF",
		]
		
		return "\n".join(lines)

def ref(x: int) -> str:
	"""Return the PDF indirect-reference text for object number x (generation 0)."""
	return "{} 0 R".format(x)

def create_pdf(symboltable: str = "symboltable", pagefiles: list = None, outf = sys.stdout.buffer):
	"""Build a PDF with one JBIG2 image per page file and write it to outf.
	
	symboltable: unused; kept so existing callers keep working.
	pagefiles: paths of the page files. Defaults to everything matching
		"page-*" in the current directory. Pages are emitted in sorted
		order; the caller's list is left untouched.
	outf: binary file object that receives the finished PDF.
	
	Page files that cannot be read, or that are too short to hold the
	size/resolution fields, are reported on stderr and skipped. The page
	tree always gets /Count and /Kids, even when no page could be loaded.
	"""
	if pagefiles is None:
		pagefiles = glob.glob("page-*")
	
	# Doc writes its cross-reference table in insertion order starting at
	# object 1 and hard-codes "/Root 1 0 R", so this document's objects must
	# be numbered 1..N. Restart the shared counter; otherwise a second call in
	# the same process would produce dangling references.
	Obj.next_id = 1
	
	doc = Doc()
	
	# Catalog, outlines and page tree root. The catalog is created first so it
	# becomes object 1; its references are filled in from the real ids.
	catalog_obj = Obj({"Type": "/Catalog"})
	outlines_obj = Obj({"Type": "/Outlines", "Count": "0"})
	pages_obj = Obj({"Type": "/Pages"})
	catalog_obj.d.d["Outlines"] = ref(outlines_obj.id)
	catalog_obj.d.d["Pages"] = ref(pages_obj.id)
	
	for root_obj in (catalog_obj, outlines_obj, pages_obj):
		doc.add_object(root_obj)
	
	# RUNASSUDO: The symbol table is not read, as it is unnecessary with lossless encoding
	
	page_objs = []
	
	# sorted() rather than list.sort() so the caller's list is not reordered.
	for p in sorted(pagefiles):
		try:
			with open(p, mode="rb") as page_file:
				contents = page_file.read()
		except OSError:
			sys.stderr.write(f"Error reading page file: {p}\n")
			continue
		
		# Bytes 11..26 hold four big-endian 32-bit unsigned integers: width,
		# height, x resolution and y resolution.
		# NOTE(review): offset 11 presumably skips the JBIG2 segment header in
		# front of the page information segment - confirm against the encoder.
		try:
			width, height, xres, yres = struct.unpack(">IIII", contents[11:27])
		except struct.error:
			sys.stderr.write(f"Error unpacking page file: {p}\n")
			continue
		
		# Set default resolution if missing
		xres = xres or dpi
		yres = yres or dpi
		
		# Page size in PDF units (72 per inch), shared by the content stream
		# and the MediaBox.
		page_width = float(width * 72) / xres
		page_height = float(height * 72) / yres
		
		# Image XObject carrying the raw page file as a JBIG2 stream. latin1
		# maps every byte to exactly one character, so the data survives the
		# round trip through str unchanged.
		# RUNASSUDO: No /DecodeParms /JBIG2Globals entry, as it is unnecessary with lossless encoding
		xobj = Obj(
			{
				"Type": "/XObject",
				"Subtype": "/Image",
				"Width": str(width),
				"Height": str(height),
				"ColorSpace": "/DeviceGray",
				"BitsPerComponent": "1",
				"Filter": "/JBIG2Decode",
			},
			contents.decode("latin1"),
		)
		
		# Content stream: scale the unit square to the page size and draw the image.
		contents_obj = Obj(
			{},
			f"q {page_width} 0 0 {page_height} 0 0 cm /Im1 Do Q",
		)
		
		# Resource dictionary that binds the name /Im1 to the image.
		resources_obj = Obj(
			{"ProcSet": "[/PDF /ImageB]", "XObject": f"<< /Im1 {ref(xobj.id)} >>"}
		)
		
		# The page itself, parented to the page tree root.
		page_obj = Obj(
			{
				"Type": "/Page",
				"Parent": ref(pages_obj.id),
				"MediaBox": f"[ 0 0 {page_width} {page_height} ]",
				"Contents": ref(contents_obj.id),
				"Resources": ref(resources_obj.id),
			}
		)
		
		# Objects must be added in creation order so ids match output positions.
		for obj in (xobj, contents_obj, resources_obj, page_obj):
			doc.add_object(obj)
		
		page_objs.append(page_obj)
	
	# Fill in the page tree once, after all pages are known. Doing it here
	# (not inside the loop) also covers the case where no page was loaded.
	pages_obj.d.d["Count"] = str(len(page_objs))
	pages_obj.d.d["Kids"] = "[" + " ".join(ref(x.id) for x in page_objs) + "]"
	
	# latin1 keeps one byte per character, which the offsets and stream
	# lengths computed on the str form rely on.
	outf.write(str(doc).encode("latin1"))

def usage(script, msg):
	"""Print an optional error and the usage line to stderr, then exit with status 1.
	
	script: program name shown in both messages.
	msg: error text printed first; skipped when empty or None.
	
	Never returns: always raises SystemExit(1).
	"""
	if msg:
		sys.stderr.write(f"{script}: {msg}\n")
	# The command line is a list of page files (see the __main__ block), not a
	# single file basename.
	sys.stderr.write(f"Usage: {script} page_file [page_file ...] > out.pdf\n")
	sys.exit(1)

if __name__ == "__main__":
	# RUNASSUDO: Overhauled to use lossless encoding
	
	# Every argument after the script name is taken as a page file path.
	pages = sys.argv[1:]
	
	if pages:
		# The first argument (symbol table) is passed as None; the PDF goes to
		# create_pdf's default output stream.
		create_pdf(None, pages)
	else:
		# Nothing to convert: print the usage text and exit with status 1.
		usage(sys.argv[0], "no pages found!")
Encode JBIG2 losslessly 2024-11-09 03:55:47 +11:00			`#!/usr/bin/env python3`

			`# GIMP plug-in for JBIG2-encoded PDF files`
			`# Copyright (C) 2024 Lee Yingtong Li (RunasSudo)`
			`#`
			`# Adapted from jbig2enc by Adam Langley <agl@imperialviolet.org> - Copyright (C) 2006 Google Inc., licensed under Apache v2`
			`# Loosely adapted from file-openraster.py - Copyright (C) 2009 by Jon Nordby <jononor@gmail.com>, licensed under the GPLv3`
			`# In turn based on MyPaint source code by Martin Renold`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <https://www.gnu.org/licenses/>.`

			`import glob`
			`import struct`
			`import sys`
			`from pathlib import Path`

			`dpi = 72 # Default DPI value`

			`class Ref:`
			`def __init__(self, x: int):`
			`self.x = x`

			`def __str__(self) -> str:`
			`return f"{self.x} 0 R"`

			`class Dict:`
			`def __init__(self, values: dict = None):`
			`if values is None:`
			`values = {}`
			`self.d = values.copy()`

			`def __str__(self) -> str:`
			`entries = [f"/{key} {value}" for key, value in self.d.items()]`
			`return f"<< {' '.join(entries)} >>\n"`

			`class Obj:`
			`next_id = 1`

			`def __init__(self, d: dict = None, stream: str = None):`
			`if d is None:`
			`d = {}`
			`if stream is not None:`
			`d["Length"] = str(len(stream))`
			`self.d = Dict(d)`
			`self.stream = stream`
			`self.id = Obj.next_id`
			`Obj.next_id += 1`

			`def __str__(self) -> str:`
			`result = [str(self.d)]`
			`if self.stream is not None:`
			`result.append(f"stream\n{self.stream}\nendstream\n")`
			`result.append("endobj\n")`
			`return "".join(result)`

			`class Doc:`
			`def __init__(self):`
			`self.objs = []`
			`self.pages = []`

			`def add_object(self, obj: Obj) -> Obj:`
			`"""Adds an object to the document."""`
			`self.objs.append(obj)`
			`return obj`

			`def add_page(self, page: Obj) -> Obj:`
			`"""Adds a page to the document and the list of objects."""`
			`self.pages.append(page)`
			`return self.add_object(page)`

			`def __str__(self) -> str:`
			`output = []`
			`offsets = []`
			`current_offset = 0`

			`def add_line(line: str):`
			`nonlocal current_offset`
			`output.append(line)`
			`current_offset += len(line) + 1 # Adding 1 for the newline character`

			`# PDF header`
			`add_line("%PDF-1.4")`

			`# Add each object and track its byte offset`
			`for obj in self.objs:`
			`offsets.append(current_offset)`
			`add_line(f"{obj.id} 0 obj")`
			`add_line(str(obj))`

			`# Cross-reference table`
			`xref_start = current_offset`
			`add_line("xref")`
			`add_line(f"0 {len(offsets) + 1}")`
			`add_line("0000000000 65535 f ")`
			`for offset in offsets:`
			`add_line(f"{offset:010} 00000 n ")`

			`# Trailer and EOF`
			`add_line("trailer")`
			`add_line(f"<< /Size {len(offsets) + 1}\n/Root 1 0 R >>")`
			`add_line("startxref")`
			`add_line(str(xref_start))`
			`add_line("%%EOF")`

			`return "\n".join(output)`

			`def ref(x: int) -> str:`
			`"""Creates a PDF reference string."""`
			`return f"{x} 0 R"`

			`def create_pdf(symboltable: str = "symboltable", pagefiles: list = None, outf = sys.stdout.buffer):`
			`"""Creates a PDF document from a symbol table and a list of page files."""`
			`if pagefiles is None:`
			`pagefiles = glob.glob("page-*")`

			`doc = Doc()`

			`# Add catalog and outlines objects`
			`catalog_obj = Obj({"Type": "/Catalog", "Outlines": ref(2), "Pages": ref(3)})`
			`outlines_obj = Obj({"Type": "/Outlines", "Count": "0"})`
			`pages_obj = Obj({"Type": "/Pages"})`

			`doc.add_object(catalog_obj)`
			`doc.add_object(outlines_obj)`
			`doc.add_object(pages_obj)`

			`# Read the symbol table`
			`# RUNASSUDO: Remove this as unnecessary with lossless encoding`
			`#try:`
			`# with open(symboltable, "rb") as sym_file:`
			`# symd = doc.add_object(Obj({}, sym_file.read().decode("latin1")))`
			`#except IOError:`
			`# sys.stderr.write(f"Error reading symbol table: {symboltable}\n")`
			`# return`

			`page_objs = []`
			`pagefiles.sort()`

			`for p in pagefiles:`
			`try:`
			`with open(p, mode="rb") as page_file:`
			`contents = page_file.read()`
			`except IOError:`
			`sys.stderr.write(f"Error reading page file: {p}\n")`
			`continue`

			`try:`
			`width, height, xres, yres = struct.unpack(">IIII", contents[11:27])`
			`except struct.error:`
			`sys.stderr.write(f"Error unpacking page file: {p}\n")`
			`continue`

			`# Set default resolution if missing`
			`xres = xres or dpi`
			`yres = yres or dpi`

			`# Create XObject (image) for the page`
			`xobj = Obj(`
			`{`
			`"Type": "/XObject",`
			`"Subtype": "/Image",`
			`"Width": str(width),`
			`"Height": str(height),`
			`"ColorSpace": "/DeviceGray",`
			`"BitsPerComponent": "1",`
			`"Filter": "/JBIG2Decode",`
			`#"DecodeParms": f"<< /JBIG2Globals {symd.id} 0 R >>", # RUNASSUDO: Remove this as unnecessary with lossless encoding`
			`},`
			`contents.decode("latin1"),`
			`)`

			`# Create content stream for the page`
			`contents_obj = Obj(`
			`{},`
			`f"q {float(width * 72) / xres} 0 0 {float(height * 72) / yres} 0 0 cm /Im1 Do Q",`
			`)`

			`# Create resource dictionary for the page`
			`resources_obj = Obj(`
			`{"ProcSet": "[/PDF /ImageB]", "XObject": f"<< /Im1 {xobj.id} 0 R >>"}`
			`)`

			`# Create the page object`
			`page_obj = Obj(`
			`{`
			`"Type": "/Page",`
			`"Parent": "3 0 R",`
			`"MediaBox": f"[ 0 0 {float(width * 72) / xres} {float(height * 72) / yres} ]",`
			`"Contents": ref(contents_obj.id),`
			`"Resources": ref(resources_obj.id),`
			`}`
			`)`

			`# Add objects to the document`
			`for obj in (xobj, contents_obj, resources_obj, page_obj):`
			`doc.add_object(obj)`

			`page_objs.append(page_obj)`

			`# Update pages object`
			`pages_obj.d.d["Count"] = str(len(page_objs))`
			`pages_obj.d.d["Kids"] = "[" + " ".join([ref(x.id) for x in page_objs]) + "]"`

			`# Output the final PDF document to stdout`
			`outf.write(str(doc).encode("latin1"))`

			`def usage(script, msg):`
			`"""Display usage information and an optional error message."""`
			`if msg:`
			`sys.stderr.write("%s: %s\n" % (script, msg))`
			`sys.stderr.write("Usage: %s [file_basename] > out.pdf\n" % script)`
			`sys.exit(1)`

			`if __name__ == "__main__":`
			`# RUNASSUDO: Overhauled to use lossless encoding`

			`pages = sys.argv[1:]`

			`# Validate that pages were found`
			`if not pages:`
			`usage(sys.argv[0], "no pages found!")`

			`create_pdf(None, pages)`