Add binary pdf loading

2021-01-23 09:21:19 +01:00 · 2021-01-23 09:21:19 +01:00 · b9982b6d0b
commit b9982b6d0b
parent 8c26a6c3cc
2 changed files with 17 additions and 2 deletions
--- a/validator/requirements.txt
+++ b/validator/requirements.txt
@@ -20,4 +20,4 @@ six==1.15.0
 tqdm==4.56.0
 urllib3==1.26.2
 Werkzeug==1.0.1
-flask-cors=3.0.10
+flask-cors==3.0.10
--- a/validator/validator.py
+++ b/validator/validator.py
@@ -5,6 +5,9 @@ import pytesseract
 from PIL import Image
 import regex as re
 from typing import Union, Tuple, Literal
+from pdf2image import convert_from_bytes
+PDF_CHARACTER_THRESHOLD = 10
 def process_jpg(f):
    text = pytesseract.image_to_string(Image.open(f), lang="ces")
@@ -13,7 +16,19 @@ def process_jpg(f):
 def process_pdf(f):
    pdf = pdftotext.PDF(f)
    aggregate = "\n\n".join(pdf)
-    return aggregate
+    count = 0
+    for a in aggregate:
+        if a.isalnum():
+            count += 1
+    if count > PDF_CHARACTER_THRESHOLD:
+        return aggregate
+    else:
+        f.seek(0, 0)
+        images = convert_from_bytes(f.read())
+        text = []
+        for image in images:
+            text.append(pytesseract.image_to_string(image, lang="ces"))
+        return "\n\n".join(text)
 def process_docx(f):
    # TODO weird output with many spaces