Add binary pdf loading

2021-01-23 09:21:19 +01:00 · 2021-01-23 09:21:19 +01:00 · b9982b6d0b
commit b9982b6d0b
parent 8c26a6c3cc
2 changed files with 17 additions and 2 deletions
--- a/validator/requirements.txt
+++ b/validator/requirements.txt
@@ -20,4 +20,4 @@ six==1.15.0
 tqdm==4.56.0
 urllib3==1.26.2
 Werkzeug==1.0.1
-flask-cors=3.0.10
+flask-cors==3.0.10
--- a/validator/validator.py
+++ b/validator/validator.py
@@ -5,6 +5,9 @@ import pytesseract
 from PIL import Image
 import regex as re
 from typing import Union, Tuple, Literal
+from pdf2image import convert_from_bytes
+
+PDF_CHARACTER_THRESHOLD = 10

 def process_jpg(f):
    text = pytesseract.image_to_string(Image.open(f), lang="ces")
@@ -13,7 +16,19 @@ def process_jpg(f):
 def process_pdf(f):
    pdf = pdftotext.PDF(f)
    aggregate = "\n\n".join(pdf)
+    count = 0
+    for a in aggregate:
+        if a.isalnum():
+            count += 1
+    if count > PDF_CHARACTER_THRESHOLD:
        return aggregate
+    else:
+        f.seek(0, 0)
+        images = convert_from_bytes(f.read())
+        text = []
+        for image in images:
+            text.append(pytesseract.image_to_string(image, lang="ces"))
+        return "\n\n".join(text)

 def process_docx(f):
    # TODO weird output with many spaces