Add binary pdf loading

This commit is contained in:
František Kmječ 2021-01-23 09:21:19 +01:00
parent 8c26a6c3cc
commit b9982b6d0b
2 changed files with 17 additions and 2 deletions

View File

@ -20,4 +20,4 @@ six==1.15.0
tqdm==4.56.0
urllib3==1.26.2
Werkzeug==1.0.1
flask-cors=3.0.10
flask-cors==3.0.10

View File

@ -5,6 +5,9 @@ import pytesseract
from PIL import Image
import regex as re
from typing import Union, Tuple, Literal
from pdf2image import convert_from_bytes
# Number of alphanumeric characters the pdftotext output must exceed for
# process_pdf to return it directly instead of falling back to OCR.
PDF_CHARACTER_THRESHOLD = 10
def process_jpg(f):
text = pytesseract.image_to_string(Image.open(f), lang="ces")
@ -13,7 +16,19 @@ def process_jpg(f):
def process_pdf(f):
    """Extract text from a PDF, using OCR when the text layer is nearly empty.

    The embedded text is read with pdftotext first. If it contains more than
    PDF_CHARACTER_THRESHOLD alphanumeric characters, it is returned unchanged.
    Otherwise the stream is rewound, its bytes are converted to images with
    pdf2image, and each image is run through Tesseract with lang="ces".

    In both cases the per-page / per-image texts are joined with a blank line.

    Args:
        f: Seekable binary file-like object holding the PDF.

    Returns:
        The extracted text as a single string.
    """
    embedded_text = "\n\n".join(pdftotext.PDF(f))

    alnum_total = sum(1 for ch in embedded_text if ch.isalnum())
    if alnum_total > PDF_CHARACTER_THRESHOLD:
        return embedded_text

    # Too little embedded text: rewind (the stream was already handed to
    # pdftotext above) and OCR the rendered images instead.
    f.seek(0, 0)
    page_images = convert_from_bytes(f.read())
    return "\n\n".join(
        pytesseract.image_to_string(page, lang="ces") for page in page_images
    )
def process_docx(f):
# TODO weird output with many spaces