"""PDF Syntax Helpers Functions in this module take variable input and produce PDF Syntax features as they are described in the Adobe PDF Reference Manual, found here: http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf Most of what happens in a PDF happens in objects, which are formatted like so:
3 0 obj <> endobjThe first line says that this is the third object in the structure of the document. There are 8 kinds of objects (Adobe Reference, 51): * Boolean values * Integer and real numbers * Strings * Names * Arrays * Dictionaries * Streams * The null object The `<<` in the second line and the `>>` in the line preceding `endobj` denote that it is a dictionary object. Dictionaries map Names to other objects. Names are the strings preceded by '/', valid Names do not have to start with a capital letter, they can be any ascii characters, # and two characters can escape non-printable ascii characters, described on page 57. `3 0 obj` means what follows here is the third object, but the name Type (represented here by `/Type`) is mapped to an indirect object reference: `0 obj` vs `0 R`. (Page 64 of Adobe Reference) The structure of this data, in python/dict form, is thus: third_obj = { '/Type': '/Page'), '/Parent': iobj_ref(1), '/Resources': iobj_ref(2), '/Contents': iobj_ref(4), } Some additional notes: Streams are of the form:
4 0 obj <> stream 68656c6c6f20776f726c64 endstream endobjIn this case, the ASCIIHexDecode filter is used because "68656c6c6f20776f726c64" is "hello world" in ascii, and 22 is the length of that string. As of this writing, I am not sure how length is actually calculated, so this remains something to be looked into. """ from abc import ABC def clear_empty_fields(d): return {k: v for k, v in d.items() if v} def create_dictionary_string( dict_, open_dict="<<", close_dict=">>", field_join="\n", key_value_join=" ", has_empty_fields=False, ): """format dictionary as PDF dictionary @param dict_: dictionary of values to render @param open_dict: string to open PDF dictionary @param close_dict: string to close PDF dictionary @param field_join: string to join fields with @param key_value_join: string to join key to value with @param has_empty_fields: whether or not to clear_empty_fields first. """ if has_empty_fields: dict_ = clear_empty_fields(dict_) return "".join( [ open_dict, field_join.join(key_value_join.join(map(str, f)) for f in dict_.items()), close_dict, ] ) def create_list_string(list_): """format list of strings as PDF array""" return f"[{' '.join(list_)}]" def iobj_ref(n): """format an indirect PDF Object reference from its id number""" return f"{n} 0 R" def create_stream(stream): if isinstance(stream, (bytearray, bytes)): stream = str(stream, "latin-1") return "\n".join(["stream", stream, "endstream"]) class PDFObject: """ Main features of this class: * delay ID assignement * implement serializing To ensure consistency on how the serialize() method operates, child classes must define a __slots__ attribute. """ # pylint: disable=redefined-builtin def __init__(self, id=None): self._id = id @property def id(self): if self._id is None: raise AttributeError( f"{self.__class__.__name__} has not been assigned an ID yet" ) return self._id @id.setter def id(self, n): self._id = n @property def ref(self): return iobj_ref(self.id) def serialize(self, fpdf=None, obj_dict=None): output = [] if fpdf: # pylint: disable=protected-access appender = fpdf._out assert ( fpdf._newobj() == self.id ), "Something went wrong in StructTree object IDs assignement" else: appender = output.append appender(f"{self.id} 0 obj") appender("<<") if not obj_dict: obj_dict = self._build_obj_dict() appender(create_dictionary_string(obj_dict, open_dict="", close_dict="")) appender(">>") appender("endobj") return "\n".join(output) def _build_obj_dict(self): """ Build the PDF Object associative map to serialize, based on this class instance properties. The property names are converted to CamelCase, and prefixed with a slash character "/". """ obj_dict = {} for key in dir(self): value = getattr(self, key) if ( callable(value) or key.startswith("_") or key in ("id", "ref") or value is None ): continue if isinstance(value, PDFObject): # indirect object reference value = value.ref elif hasattr(value, "serialize"): # e.g. PDFArray & PDFString value = value.serialize() obj_dict[f"/{camel_case(key)}"] = value return obj_dict def camel_case(property_name): return "".join(x for x in property_name.title() if x != "_") class PDFString(str): def serialize(self): # Filtering out characters that are not encodable as Latin1 for now, # as an outline /Title seemingly cannot "just" be encoded as UTF-16BE: return f'({self.encode("latin-1", "ignore").decode("latin-1")})' class PDFArray(list): def serialize(self): if all(isinstance(elem, PDFObject) for elem in self): serialized_elems = "\n".join(elem.ref for elem in self) elif all(isinstance(elem, int) for elem in self): serialized_elems = " ".join(map(str, self)) else: raise NotImplementedError(f"PDFArray.serialize with self={self}") return f"[{serialized_elems}]" # cf. section 8.2.1 "Destinations" of the 2006 PDF spec 1.7: class Destination(ABC): def as_str(self, pdf=None): raise NotImplementedError class DestinationXYZ(Destination): def __init__(self, page, x=0, y=0, zoom="null", page_as_obj_id=True): self.page = page self.x = x self.y = y self.zoom = zoom self.page_as_obj_id = page_as_obj_id def __repr__(self): return f'DestinationXYZ(page={self.page}, x={self.x}, y={self.y}, zoom="{self.zoom}", page_as_obj_id={self.page_as_obj_id})' def as_str(self, pdf=None): left = self.x * pdf.k if pdf else self.x if isinstance(left, float): left = round(left, 2) top = (pdf.h_pt - self.y * pdf.k) if pdf else self.y if isinstance(top, float): top = round(top, 2) # The page object ID is predictable given that _putpages is invoked first in _enddoc: page = iobj_ref(2 * self.page + 1) if self.page_as_obj_id else self.page return f"[{page} /XYZ {left} {top} {self.zoom}]"