diff --git a/validator/requirements.txt b/validator/requirements.txt
index 2a4b212..986a06d 100644
--- a/validator/requirements.txt
+++ b/validator/requirements.txt
@@ -20,4 +20,4 @@ six==1.15.0
 tqdm==4.56.0
 urllib3==1.26.2
 Werkzeug==1.0.1
-flask-cors=3.0.10
+flask-cors==3.0.10
diff --git a/validator/validator.py b/validator/validator.py
index 430e167..7f8559b 100644
--- a/validator/validator.py
+++ b/validator/validator.py
@@ -5,6 +5,9 @@ import pytesseract
 from PIL import Image
 import regex as re
 from typing import Union, Tuple, Literal
+from pdf2image import convert_from_bytes
+
+PDF_CHARACTER_THRESHOLD = 10
 
 def process_jpg(f):
     text = pytesseract.image_to_string(Image.open(f), lang="ces")
@@ -13,7 +16,19 @@ def process_jpg(f):
 def process_pdf(f):
     pdf = pdftotext.PDF(f)
     aggregate = "\n\n".join(pdf)
-    return aggregate
+    count = 0
+    for a in aggregate:
+        if a.isalnum():
+            count += 1
+    if count > PDF_CHARACTER_THRESHOLD:
+        return aggregate
+    else:
+        f.seek(0, 0)
+        images = convert_from_bytes(f.read())
+        text = []
+        for image in images:
+            text.append(pytesseract.image_to_string(image, lang="ces"))
+        return "\n\n".join(text)
 
 def process_docx(f):
     # TODO weird output with many spaces