Merge branch validator into master

This commit is contained in:
Vojtěch Káně 2021-01-23 02:43:14 +01:00
commit 31e45d7194
7 changed files with 162 additions and 0 deletions

View File

@ -15,5 +15,26 @@
'';
};
# Development shell for the validator: Python 3.8 with pip and the Geneea NLP
# client SDK, which is built here from its PyPI source release.
packages.x86_64-linux.validator-dev =
  let
    py = pkgs.python38Packages;

    # Geneea SDK built from PyPI; doCheck = false skips its check phase.
    geneeaClient = py.buildPythonPackage rec {
      pname = "geneea-nlp-client";
      version = "1.2.0";
      src = py.fetchPypi {
        inherit pname version;
        sha256 = "Q0fYD3V0NbUOItpCwA6ExIy7sIcQxpfqr1aPSXd4+cc=";
      };
      propagatedBuildInputs = [ py.requests py.retrying ];
      doCheck = false;
    };
  in
  pkgs.mkShell {
    buildInputs = [ pkgs.python38 py.pip geneeaClient ];
    # On shell entry, prompt (without echo) for the API key and export it as
    # $geneea_api_key for child processes.
    shellHook = ''
      read -p 'Insert Geneea API key:' -r -s geneea_api_key
      export geneea_api_key
    '';
  };
};
}

2
validator/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
env/
__pycache__/

31
validator/geneea.py Executable file
View File

@ -0,0 +1,31 @@
#!/usr/bin/env python
import os
from geneeanlpclient import g3
def getKey() -> str:
    """Return the Geneea API key stored in the ``geneea_api_key`` env variable.

    Returns:
        The key exactly as it is stored in the environment.

    Raises:
        ValueError: If the variable is missing, empty, or only whitespace.
    """
    key = os.getenv("geneea_api_key")
    # A variable that is exported but empty (e.g. the user just pressed Enter
    # at a prompt) is as unusable as a missing one, so reject both here
    # instead of passing a blank key on to the API client.
    if key is None or not key.strip():
        raise ValueError("$geneea_api_key env var was not set or is empty")
    return key
def doTheTest():
    """Analyze one hard-coded Czech sentence with Geneea and print its relations.

    The API key is obtained through ``getKey()``. Every relation in the result
    is printed on its own line as ``<type> <textRepr> - <name>``.
    """
    request_builder = g3.Request.Builder(
        analyses=[g3.AnalysisType.ALL],
        returnMentions=True,
        returnItemSentiment=True,
        domain=g3.Domain.MEDIA,
        textType="Business",
        referenceDate="2015-03-14",
        diacritization="redo",
        language="cs",
    )
    with g3.Client.create(userKey=getKey()) as client:
        analysis = client.analyze(
            request_builder.build(
                id="1",
                text='Koláč s rozinkami panu Karlovi moc chutnal.',
                language="cs",
            )
        )
        # Handy while debugging:
        #   print(analysis)
        #   for token in analysis.tokens:
        #       print(f'{token} lemma "{token.lemma}"')
        for relation in analysis.relations:
            print(f'{relation.type} {relation.textRepr} - {relation.name}')
# Script entry point: run the sample analysis only when executed directly.
if "__main__" == __name__:
    doTheTest()

View File

@ -0,0 +1,22 @@
certifi==2020.12.5
chardet==4.0.0
click==7.1.2
docx2txt==0.8
Flask==1.1.2
geneea-nlp-client==1.2.0
idna==2.10
itsdangerous==1.1.0
Jinja2==2.11.2
joblib==1.0.0
MarkupSafe==1.1.1
nltk==3.5
pdftotext==2.1.5
Pillow==8.1.0
pytesseract==0.3.7
regex==2020.11.13
requests==2.25.1
retrying==1.3.3
six==1.15.0
tqdm==4.56.0
urllib3==1.26.2
Werkzeug==1.0.1

24
validator/server.py Normal file
View File

@ -0,0 +1,24 @@
from flask import Flask, render_template, request, jsonify
import json
import validator
# Flask application object; the @app.route decorators in this module register
# their view functions on it.
app = Flask(__name__)
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/validator', methods=['GET', 'POST'])
def validate():
    """Accept an uploaded document, extract its text and return the validation result.

    POST: expects a multipart form with a ``file`` part. The file is converted
    to text with ``validator.process_file`` and the text is checked with
    ``validator.validate``; the result is returned as JSON.

    GET: returns a fixed placeholder message.

    Returns:
        A JSON response on POST (status 400 when no file was uploaded, 415 when
        ``process_file`` could not handle the file type), or a plain string on GET.
    """
    if request.method != 'POST':
        return 'Soubor byl zvalidován. TODO musím ověřit, jak byl zvalidován.'  # TODO change

    # .get() avoids an unhandled key error when the form lacks the "file" part;
    # an empty filename means the form was submitted without choosing a file.
    uploaded = request.files.get('file')
    if uploaded is None or not uploaded.filename:
        return jsonify(error='No file was uploaded.'), 400

    text = validator.process_file(uploaded)
    if text is None:
        # process_file returns None for extensions it does not recognise.
        return jsonify(error='Unsupported file type.'), 415

    return jsonify(validator.validate(text))
# Start Flask's built-in server only when this file is executed directly, so
# importing the module (from a WSGI server, a test, ...) does not block on it.
if __name__ == '__main__':
    app.run()

View File

@ -0,0 +1,9 @@
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>Validator</title>
  </head>
  <body>
    <!-- Upload form. The relative action posts to /validator on whichever
         host and port served this page, instead of a fixed localhost URL. -->
    <form action="/validator" method="POST" enctype="multipart/form-data">
      <input type="file" name="file" required /> <!-- TODO: add a message for the user -->
      <input type="submit" />
    </form>
  </body>
</html>

53
validator/validator.py Normal file
View File

@ -0,0 +1,53 @@
import os
import pdftotext
import docx2txt
import pytesseract
from PIL import Image
def process_jpg(f):
    """Run OCR on an image and return the recognised text.

    Args:
        f: Whatever ``PIL.Image.open`` accepts (it is passed through unchanged).

    Returns:
        The string produced by ``pytesseract.image_to_string`` with
        ``lang="ces"`` (Tesseract's Czech language data).
    """
    # The context manager releases the image's resources as soon as OCR is
    # done rather than leaving that to garbage collection.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang="ces")
def process_pdf(f):
    """Extract the text of a PDF, with pages separated by a blank line.

    Args:
        f: Passed unchanged to ``pdftotext.PDF``.

    Returns:
        All page texts joined with ``"\\n\\n"``.
    """
    return "\n\n".join(pdftotext.PDF(f))
def process_docx(f):
    """Extract the text of a .docx document via ``docx2txt.process``.

    Args:
        f: Passed unchanged to ``docx2txt.process``.

    Returns:
        Whatever ``docx2txt.process`` returns for ``f``.
    """
    # TODO: the output is odd, it contains many spaces
    extracted = docx2txt.process(f)
    return extracted
def process_file(f):
    """Extract text from an uploaded file, choosing the extractor by file extension.

    The extension is matched case-insensitively, so ``scan.JPG`` and
    ``lawsuit.PDF`` are handled the same as their lower-case forms.

    Args:
        f: Upload object exposing a ``filename`` attribute; it is handed
            unchanged to the matching ``process_*`` function.

    Returns:
        The extracted text, or ``None`` when the extension is missing or not
        one of ``.jpg``/``.jpeg``, ``.pdf``, ``.docx``.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # "or ''" keeps splitext from failing when no filename was provided.
    _, ext = os.path.splitext(f.filename or "")
    ext = ext.lower()
    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)
    return None
def validate_court(lawsuit):
    """Placeholder, not implemented yet: ignores ``lawsuit`` and returns ``None``.

    NOTE(review): presumably meant to check the court named in the lawsuit - confirm.
    """
    return None
def validate_accuser(lawsuit):
    """Placeholder, not implemented yet: ignores ``lawsuit`` and returns ``None``.

    NOTE(review): presumably meant to check the accuser named in the lawsuit - confirm.
    """
    return None
# Hard to implement.
def validate_topic():
    """Placeholder, not implemented yet: takes no arguments and returns ``None``."""
    return None
# Also hard to implement.
def validate_intent():
    """Placeholder, not implemented yet: takes no arguments and returns ``None``."""
    return None
def validate_signature():
    """Placeholder, not implemented yet: takes no arguments and returns ``None``."""
    return None
def validate_date():
    """Placeholder, not implemented yet: takes no arguments and returns ``None``."""
    return None
def validate(text):
    """Placeholder entry point for validation: ignores ``text`` and returns ``None``.

    NOTE(review): presumably meant to run the individual ``validate_*`` checks
    on the extracted text - confirm once they are implemented.
    """
    return None