Add binary pdf loading

This commit is contained in:
František Kmječ 2021-01-23 09:21:19 +01:00
parent 8c26a6c3cc
commit b9982b6d0b
2 changed files with 17 additions and 2 deletions

View File

@@ -20,4 +20,4 @@ six==1.15.0
tqdm==4.56.0 tqdm==4.56.0
urllib3==1.26.2 urllib3==1.26.2
Werkzeug==1.0.1 Werkzeug==1.0.1
flask-cors=3.0.10 flask-cors==3.0.10

View File

@@ -5,6 +5,9 @@ import pytesseract
from PIL import Image from PIL import Image
import regex as re import regex as re
from typing import Union, Tuple, Literal from typing import Union, Tuple, Literal
from pdf2image import convert_from_bytes
PDF_CHARACTER_THRESHOLD = 10
def process_jpg(f): def process_jpg(f):
text = pytesseract.image_to_string(Image.open(f), lang="ces") text = pytesseract.image_to_string(Image.open(f), lang="ces")
@@ -13,7 +16,19 @@ def process_jpg(f):
def process_pdf(f): def process_pdf(f):
pdf = pdftotext.PDF(f) pdf = pdftotext.PDF(f)
aggregate = "\n\n".join(pdf) aggregate = "\n\n".join(pdf)
return aggregate count = 0
for a in aggregate:
if a.isalnum():
count += 1
if count > PDF_CHARACTER_THRESHOLD:
return aggregate
else:
f.seek(0, 0)
images = convert_from_bytes(f.read())
text = []
for image in images:
text.append(pytesseract.image_to_string(image, lang="ces"))
return "\n\n".join(text)
def process_docx(f): def process_docx(f):
# TODO weird output with many spaces # TODO weird output with many spaces