gimp-file-jbig2pdf/jbig2topdf_lossless.py

235 lines
6.3 KiB
Python
Raw Normal View History

2024-11-09 03:55:47 +11:00
#!/usr/bin/env python3
# GIMP plug-in for JBIG2-encoded PDF files
# Copyright (C) 2024 Lee Yingtong Li (RunasSudo)
#
# Adapted from jbig2enc by Adam Langley <agl@imperialviolet.org> - Copyright (C) 2006 Google Inc., licensed under Apache v2
# Loosely adapted from file-openraster.py - Copyright (C) 2009 by Jon Nordby <jononor@gmail.com>, licensed under the GPLv3
# In turn based on MyPaint source code by Martin Renold
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import glob
import struct
import sys
from pathlib import Path
# Fallback resolution in dots per inch; create_pdf() substitutes it when a
# page file reports an x or y resolution of 0.
dpi = 72 # Default DPI value
class Ref:
    """Reference to a PDF object by number; renders as ``<x> 0 R``."""

    def __init__(self, x: int):
        # x is the number of the object being referred to
        self.x = x

    def __str__(self) -> str:
        return "{} 0 R".format(self.x)
class Dict:
    """PDF dictionary that serialises as ``<< /Key value ... >>`` plus a newline."""

    def __init__(self, values: dict = None):
        # Keep a shallow copy so later edits to self.d never reach the caller's mapping
        self.d = {} if values is None else values.copy()

    def __str__(self) -> str:
        # Entries are emitted in insertion order, each as "/Key value"
        body = " ".join("/{} {}".format(name, val) for name, val in self.d.items())
        return "<< " + body + " >>\n"
class Obj:
    """A numbered PDF object: a dictionary plus an optional stream body.

    Every instance takes the next value of the class-wide ``next_id`` counter
    as its ``id``, so ids follow creation order across the whole process.

    Args:
        d: Dictionary entries (key -> already-serialised value string).
            The mapping is copied; the caller's dict is never modified.
        stream: Optional stream body as text. When given, a ``Length`` entry
            holding ``len(stream)`` is added to the dictionary. Callers in
            this file pass latin-1 decoded text, for which the character
            count equals the encoded byte count.
    """

    next_id = 1

    def __init__(self, d: dict = None, stream: str = None):
        # Copy first: adding /Length below must not leak into the caller's dict.
        entries = {} if d is None else dict(d)
        if stream is not None:
            entries["Length"] = str(len(stream))
        self.d = Dict(entries)
        self.stream = stream
        self.id = Obj.next_id
        Obj.next_id += 1

    def __str__(self) -> str:
        """Serialise the object body (dictionary, optional stream, ``endobj``).

        The ``<id> 0 obj`` header line is not included; Doc.__str__ writes it.
        """
        parts = [str(self.d)]
        if self.stream is not None:
            parts.append(f"stream\n{self.stream}\nendstream\n")
        parts.append("endobj\n")
        return "".join(parts)
class Doc:
    """Ordered collection of PDF objects that ``str()`` renders as a whole file."""

    def __init__(self):
        self.objs = []   # every object, in output order
        self.pages = []  # the subset registered through add_page()

    def add_object(self, obj: "Obj") -> "Obj":
        """Append *obj* to the document and hand it back."""
        self.objs.append(obj)
        return obj

    def add_page(self, page: "Obj") -> "Obj":
        """Record *page* as a page and also append it to the object list."""
        self.pages.append(page)
        return self.add_object(page)

    def __str__(self) -> str:
        """Render header, objects, cross-reference table and trailer.

        Offsets are counted in characters, with one extra per line for the
        "\\n" that joins the lines.
        """
        header = "%PDF-1.4"
        cursor = len(header) + 1  # position just past the header line

        body_lines = []
        obj_offsets = []
        for item in self.objs:
            obj_offsets.append(cursor)
            head = "{} 0 obj".format(item.id)
            text = str(item)
            body_lines.extend((head, text))
            # two lines were emitted, hence two joining newlines
            cursor += len(head) + len(text) + 2

        # cursor now points at the "xref" keyword
        entry_count = len(obj_offsets) + 1  # +1 for the free entry 0
        xref_lines = ["xref", f"0 {entry_count}", "0000000000 65535 f "]
        xref_lines.extend(f"{off:010} 00000 n " for off in obj_offsets)

        tail_lines = [
            "trailer",
            f"<< /Size {entry_count}\n/Root 1 0 R >>",
            "startxref",
            str(cursor),
            "%%EOF",
        ]
        return "\n".join([header] + body_lines + xref_lines + tail_lines)
def ref(x: int) -> str:
    """Return the PDF reference string ``<x> 0 R`` for object number *x*."""
    return "{} 0 R".format(x)
def create_pdf(symboltable: str = "symboltable", pagefiles: list = None, outf=None):
    """Write a PDF with one page per JBIG2 page file.

    Each page file is embedded unchanged as a 1-bit /JBIG2Decode image that
    fills its page. Files that cannot be read, or that are too short to hold
    the size/resolution fields, are reported on stderr and skipped.

    Args:
        symboltable: Unused. Kept so existing callers keep working.
            RUNASSUDO: the symbol table (and the matching /JBIG2Globals
            DecodeParms entry) was removed as unnecessary with lossless
            encoding.
        pagefiles: Paths of the page files. Defaults to ``glob("page-*")``.
            Pages are emitted in sorted path order; the argument itself is
            left untouched.
        outf: Binary writable the PDF bytes are written to. Defaults to
            ``sys.stdout.buffer``, looked up when the function is called.
    """
    if pagefiles is None:
        pagefiles = glob.glob("page-*")
    if outf is None:
        # Resolve at call time so a redirected sys.stdout is honoured and
        # importing this module never depends on sys.stdout having .buffer.
        outf = sys.stdout.buffer

    # Doc numbers xref entries by position and hard-codes "/Root 1 0 R", so
    # every document must start numbering at 1 even when this function is
    # called more than once in the same process.
    Obj.next_id = 1

    doc = Doc()

    # Catalog, outlines and page-tree root; cross references use the real ids
    catalog_obj = doc.add_object(Obj({"Type": "/Catalog"}))
    outlines_obj = doc.add_object(Obj({"Type": "/Outlines", "Count": "0"}))
    pages_obj = doc.add_object(Obj({"Type": "/Pages"}))
    catalog_obj.d.d["Outlines"] = ref(outlines_obj.id)
    catalog_obj.d.d["Pages"] = ref(pages_obj.id)

    page_objs = []
    # sorted() instead of list.sort(): do not reorder the caller's list
    for p in sorted(pagefiles):
        try:
            with open(p, mode="rb") as page_file:
                contents = page_file.read()
        except OSError:
            sys.stderr.write(f"Error reading page file: {p}\n")
            continue

        # Width, height, x-resolution and y-resolution are four big-endian
        # 32-bit unsigned integers at byte offset 11 of the page file.
        try:
            width, height, xres, yres = struct.unpack(">IIII", contents[11:27])
        except struct.error:
            sys.stderr.write(f"Error unpacking page file: {p}\n")
            continue

        # Set default resolution if missing
        xres = xres or dpi
        yres = yres or dpi

        # Page size: pixels * 72 / resolution, computed once and used for
        # both the image scaling matrix and the MediaBox.
        page_width = float(width * 72) / xres
        page_height = float(height * 72) / yres

        # Image XObject holding the raw page data; latin1 maps every byte to
        # exactly one character, so the final encode() round-trips it.
        xobj = Obj(
            {
                "Type": "/XObject",
                "Subtype": "/Image",
                "Width": str(width),
                "Height": str(height),
                "ColorSpace": "/DeviceGray",
                "BitsPerComponent": "1",
                "Filter": "/JBIG2Decode",
            },
            contents.decode("latin1"),
        )

        # Content stream: scale the unit image to the page size and draw it
        contents_obj = Obj(
            {},
            f"q {page_width} 0 0 {page_height} 0 0 cm /Im1 Do Q",
        )

        # Resource dictionary exposing the image under the name /Im1
        resources_obj = Obj(
            {"ProcSet": "[/PDF /ImageB]", "XObject": f"<< /Im1 {xobj.id} 0 R >>"}
        )

        page_obj = Obj(
            {
                "Type": "/Page",
                "Parent": ref(pages_obj.id),
                "MediaBox": f"[ 0 0 {page_width} {page_height} ]",
                "Contents": ref(contents_obj.id),
                "Resources": ref(resources_obj.id),
            }
        )

        # Objects are added in creation order so position matches id
        for obj in (xobj, contents_obj, resources_obj, page_obj):
            doc.add_object(obj)
        page_objs.append(page_obj)

    # Fill in the page tree now that all pages are known
    pages_obj.d.d["Count"] = str(len(page_objs))
    pages_obj.d.d["Kids"] = "[" + " ".join(ref(x.id) for x in page_objs) + "]"

    # Emit the finished document to the output stream
    outf.write(str(doc).encode("latin1"))
def usage(script, msg):
    """Print an optional error plus the usage line to stderr, then exit.

    Args:
        script: Program name shown in the messages.
        msg: Error text printed before the usage line; skipped when falsy.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    if msg:
        sys.stderr.write(f"{script}: {msg}\n")
    # The command line takes one or more page files, not an optional basename
    sys.stderr.write(f"Usage: {script} page_file [page_file ...] > out.pdf\n")
    sys.exit(1)
if __name__ == "__main__":
    # RUNASSUDO: Overhauled to use lossless encoding
    # Every command-line argument is taken as the path of one page file
    page_paths = sys.argv[1:]
    if page_paths:
        # No symbol table is passed; output goes to create_pdf's default stream
        create_pdf(None, page_paths)
    else:
        # usage() prints the message and exits with status 1
        usage(sys.argv[0], "no pages found!")