Add pdf to text

This commit is contained in:
František Kmječ 2021-01-22 21:53:52 +01:00
parent 3dee27732c
commit c51386f195
3 changed files with 20 additions and 3 deletions

View File

@@ -9,6 +9,8 @@ Jinja2==2.11.2
joblib==1.0.0 joblib==1.0.0
MarkupSafe==1.1.1 MarkupSafe==1.1.1
nltk==3.5 nltk==3.5
pdftotext==2.1.5
PyPDF2==1.26.0
regex==2020.11.13 regex==2020.11.13
requests==2.25.1 requests==2.25.1
retrying==1.3.3 retrying==1.3.3

View File

@@ -1,4 +1,4 @@
from flask import Flask, render_template, request from flask import Flask, render_template, request, jsonify
import json import json
import validator import validator

View File

@@ -1,14 +1,29 @@
import os
import pdftotext
def process_jpg(f):
    """Placeholder for JPG handling; not implemented yet.

    Args:
        f: The uploaded file object. Currently unused.

    Returns:
        None, always, until an implementation is added.
    """
    pass
def process_pdf(f):
    """Extract the text of a PDF and return it as a single string.

    Args:
        f: The PDF to read; it is handed directly to ``pdftotext.PDF``
            (presumably a binary file-like object -- confirm against caller).

    Returns:
        The text of all pages joined together, with a blank line
        (``"\\n\\n"``) between consecutive pages.
    """
    # The PDF object is iterated by ``join``, one string per page.
    return "\n\n".join(pdftotext.PDF(f))
def process_docx(f):
    """Placeholder for DOCX handling; not implemented yet.

    Args:
        f: The uploaded file object. Currently unused.

    Returns:
        None, always, until an implementation is added.
    """
    pass
def process_file(f):
    """Dispatch an uploaded file to the handler matching its extension.

    Args:
        f: Object exposing a ``filename`` attribute (presumably the upload
            object received from the web framework -- confirm against caller).

    Returns:
        The result of ``process_jpg``, ``process_pdf`` or ``process_docx``
        depending on the extension, or ``None`` when the extension is not
        supported or the file has no name.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # A missing filename is treated as "no extension" instead of crashing.
    _, ext = os.path.splitext(f.filename or "")
    # Compare case-insensitively so "SCAN.PDF" is handled like "scan.pdf".
    ext = ext.lower()
    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)
    # Unsupported extension: keep the implicit "no result" contract explicit.
    return None
def validate_court(lawsuit): def validate_court(lawsuit):
pass pass