#!/usr/bin/env python3 # Invoke Datalogics PDF Checker & parse its output # Purpose of this script: # * abort the validation pipeline with a non-zero error code if any check fails on a PDF sample # * aggregate all checks performed in a concise summary # * allow to ignore some errors considered harmless, listed in pdfchecker-ignore.json # USAGE: ./pdfchecker.py [$pdf_filepath] import sys from subprocess import check_output from scripts.checker_commons import aggregate, print_aggregated_report AGGREGATED_REPORT_FILEPATH = "pdfchecker-aggregated.json" IGNORE_WHITELIST_FILEPATH = "scripts/pdfchecker-ignore.json" CHECKS_DETAILS_URL = "https://dev.datalogics.com/pdf-checker/the-json-profile-file/description-of-json-profile-parameters/" UNPROCESSABLE_PDF_ERROR_LINE = "Unable to process document due to PDF Error" CHECKER_SUMMARY_END_LINE = "<<=CHECKER_SUMMARY_END=>>" def analyze_pdf_file(pdf_filepath): output = check_output( [ "PDF_Checker/pdfchecker", "--profile", "PDF_Checker/CheckerProfiles/everything.json", "--input", pdf_filepath, ] ).decode() report = parse_output(output) aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH) def parse_output(output): """ Parse PDF Checker indented output into a dict-tree. Tree leaves are empty dicts. """ lines = output.splitlines() version = lines[0] if UNPROCESSABLE_PDF_ERROR_LINE in lines: return { "failure": UNPROCESSABLE_PDF_ERROR_LINE, "version": version, } assert CHECKER_SUMMARY_END_LINE in lines, "\n".join(lines) lines = lines[lines.index(CHECKER_SUMMARY_END_LINE) + 2 :] analysis = insert_indented(lines) return { "errors": [ error for section in analysis.values() for error in section.get("Errors:", {}).keys() if error != "None" ], "version": version, } def insert_indented(lines, node=None, depth=0, indent=0): if node is None: node = {} prev_node = None while lines: line = lines[0] if not line: lines.pop(0) continue line_indent = len(line) - len(line.lstrip()) text = line[line_indent:].rstrip() if line_indent >= indent and text in ( "Color Images", "Grayscale Images", "Monochrome Images", ): if depth > 1: # Leaving this branch of the tree after processing a "* Images" block return # Special case handled by creating a subnode for this "* Images" block: lines.pop(0) node[text] = {} insert_indented(lines, node[text], depth + 1, indent) continue if line_indent == indent: lines.pop(0) prev_node = node[text] = {} continue if line_indent > indent: if prev_node is None: # Case of more than 1 level of indentation, e.g. "How To Optimize:" section lines.pop(0) node[text] = {} continue assert ( prev_node is not None ), f"depth={depth} indent={indent} line_indent={line_indent}: {line}" insert_indented(lines, prev_node, depth + 1, indent + 4) continue return # line_indent < indent return node if __name__ == "__main__": if len(sys.argv) < 2: print_aggregated_report( AGGREGATED_REPORT_FILEPATH, CHECKS_DETAILS_URL, IGNORE_WHITELIST_FILEPATH ) elif len(sys.argv) > 2: print(sys.argv, file=sys.stderr) print("Exactly one argument must be passed to pdfchecker.py", file=sys.stderr) sys.exit(2) else: analyze_pdf_file(sys.argv[1])