Merge branch validator into master
This commit is contained in:
commit
31e45d7194
21
flake.nix
21
flake.nix
@ -15,5 +15,26 @@
|
||||
'';
|
||||
};
|
||||
|
||||
packages.x86_64-linux.validator-dev =
  let
    # Python 3.8 package set used for every Python dependency below.
    py = pkgs.python38Packages;

    # Name and version of the Geneea client, shared by the package
    # definition and by the PyPI fetcher.
    pname = "geneea-nlp-client";
    version = "1.2.0";

    # Geneea NLP client built from the sources fetched from PyPI.
    geneea_sdk = py.buildPythonPackage {
      inherit pname version;
      src = py.fetchPypi {
        inherit pname version;
        sha256 = "Q0fYD3V0NbUOItpCwA6ExIy7sIcQxpfqr1aPSXd4+cc=";
      };
      propagatedBuildInputs = [ py.requests py.retrying ];
      # The package's check phase is disabled.
      doCheck = false;
    };
  in
  pkgs.mkShell {
    buildInputs = [ pkgs.python38 py.pip geneea_sdk ];
    # Prompt for the API key without echoing it, then export it so that
    # programs started from this shell can read it from the environment.
    shellHook = ''
      read -p 'Insert Geneea API key:' -r -s geneea_api_key
      export geneea_api_key
    '';
  };
|
||||
|
||||
};
|
||||
}
|
||||
|
2
validator/.gitignore
vendored
Normal file
2
validator/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
env/
|
||||
__pycache__/
|
31
validator/geneea.py
Executable file
31
validator/geneea.py
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
from geneeanlpclient import g3
|
||||
|
||||
|
||||
def getKey() -> str:
    """Return the Geneea API key stored in the environment.

    The key is read from the ``geneea_api_key`` environment variable.

    Returns:
        The non-empty key string.

    Raises:
        ValueError: If the variable is not set, or is set to an empty
            string (for example when an interactive prompt that fills it
            was dismissed without typing anything).
    """
    key = os.getenv("geneea_api_key")
    if key is None:
        raise ValueError("$geneea_api_key env var was not set")
    # An empty value cannot be a usable key; reject it here rather than
    # handing it on to the client.
    if not key:
        raise ValueError("$geneea_api_key env var is empty")
    return key
|
||||
|
||||
|
||||
def doTheTest():
    """Analyze one sample sentence with the g3 client and print its relations.

    A request builder is configured once, a client is opened with the key
    from ``getKey()``, and every relation of the analysis result is printed
    as ``<type> <textRepr> - <name>``.
    """
    builder = g3.Request.Builder(
        analyses=[g3.AnalysisType.ALL],
        returnMentions=True,
        returnItemSentiment=True,
        domain=g3.Domain.MEDIA,
        textType="Business",
        referenceDate="2015-03-14",
        diacritization="redo",
        language="cs",
    )

    with g3.Client.create(userKey=getKey()) as analyzer:
        # The request is built only after the client exists, so a missing
        # key is reported before any request object is created.
        sample_request = builder.build(
            id="1",
            text='Koláč s rozinkami panu Karlovi moc chutnal.',
            language="cs",
        )
        result = analyzer.analyze(sample_request)

        for relation in result.relations:
            print(f'{relation.type} {relation.textRepr} - {relation.name}')
|
||||
|
||||
|
||||
# Run the sample analysis only when this file is executed as a script.
if __name__ == "__main__":
    doTheTest()
|
22
validator/requirements.txt
Normal file
22
validator/requirements.txt
Normal file
@ -0,0 +1,22 @@
|
||||
certifi==2020.12.5
|
||||
chardet==4.0.0
|
||||
click==7.1.2
|
||||
docx2txt==0.8
|
||||
Flask==1.1.2
|
||||
geneea-nlp-client==1.2.0
|
||||
idna==2.10
|
||||
itsdangerous==1.1.0
|
||||
Jinja2==2.11.2
|
||||
joblib==1.0.0
|
||||
MarkupSafe==1.1.1
|
||||
nltk==3.5
|
||||
pdftotext==2.1.5
|
||||
Pillow==8.1.0
|
||||
pytesseract==0.3.7
|
||||
regex==2020.11.13
|
||||
requests==2.25.1
|
||||
retrying==1.3.3
|
||||
six==1.15.0
|
||||
tqdm==4.56.0
|
||||
urllib3==1.26.2
|
||||
Werkzeug==1.0.1
|
24
validator/server.py
Normal file
24
validator/server.py
Normal file
@ -0,0 +1,24 @@
|
||||
from flask import Flask, render_template, request, jsonify
|
||||
import json
|
||||
import validator
|
||||
|
||||
# Flask application object; the route decorators in this module register
# their view functions on it.
app = Flask(__name__)
|
||||
|
||||
@app.route('/')
def index():
    """Serve the page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
|
||||
|
||||
@app.route('/validator', methods=['GET', 'POST'])
def validate():
    """Handle requests to ``/validator``.

    For a POST, the upload found under the ``file`` form field is turned
    into text by ``validator.process_file`` and the outcome of
    ``validator.validate`` on that text is returned as JSON. Every other
    request receives a fixed placeholder message.
    """
    # Anything that is not an upload gets the placeholder answer.
    if request.method != 'POST':
        return 'Soubor byl zvalidován. TODO musím ověřit, jak byl zvalidován.'  # TODO change

    print(request.files)
    upload = request.files['file']

    extracted_text = validator.process_file(upload)
    return jsonify(validator.validate(extracted_text))
|
||||
|
||||
|
||||
# Start the Flask server with its default settings.
# NOTE(review): this call appears to sit at module level without an
# `if __name__ == '__main__':` guard, so it would also run when the module
# is imported - confirm whether that is intended.
app.run()
|
9
validator/templates/index.html
Normal file
9
validator/templates/index.html
Normal file
@ -0,0 +1,9 @@
|
||||
<html>
   <body>
      <!-- Upload form: sends the chosen file to the validator endpoint as multipart form data. -->
      <!-- The target URL is generated by Flask from the view name, so the form keeps working on any host or port. -->
      <form action = "{{ url_for('validate') }}" method = "POST"
         enctype = "multipart/form-data">
         <input type = "file" name = "file" /> <!-- TODO message -->
         <input type = "submit"/>
      </form>
   </body>
</html>
|
53
validator/validator.py
Normal file
53
validator/validator.py
Normal file
@ -0,0 +1,53 @@
|
||||
import os
|
||||
import pdftotext
|
||||
import docx2txt
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
|
||||
def process_jpg(f):
    """Return the text recognised in image ``f``.

    The image is opened with PIL and passed to
    ``pytesseract.image_to_string`` with ``lang="ces"``.
    """
    image = Image.open(f)
    return pytesseract.image_to_string(image, lang="ces")
|
||||
|
||||
def process_pdf(f):
    """Return the text of PDF ``f`` as one string.

    The items yielded by ``pdftotext.PDF`` are joined with a blank line
    (``"\\n\\n"``) between them.
    """
    pages = pdftotext.PDF(f)
    return "\n\n".join(pages)
|
||||
|
||||
def process_docx(f):
    """Return whatever ``docx2txt.process`` extracts from document ``f``."""
    # TODO weird output with many spaces
    extracted = docx2txt.process(f)
    return extracted
|
||||
|
||||
|
||||
def process_file(f):
    """Extract text from an uploaded file, choosing the extractor by suffix.

    Args:
        f: Upload object exposing a ``filename`` attribute. It is passed on
            unchanged to the matching ``process_*`` function.

    Returns:
        The extracted text, or ``None`` when the filename is missing or its
        suffix is not one of ``.jpg``, ``.jpeg``, ``.pdf`` or ``.docx``.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # A missing filename is handled like a name without any suffix.
    _, ext = os.path.splitext(f.filename or "")
    # Suffixes are compared case-insensitively so that names such as
    # "SCAN.JPG" or "Lawsuit.PDF" are recognised as well.
    ext = ext.lower()

    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)

    # No extractor applies to this suffix.
    return None
|
||||
|
||||
def validate_court(lawsuit):
    """Placeholder, presumably for checking the court named in ``lawsuit``.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None
|
||||
|
||||
def validate_accuser(lawsuit):
    """Placeholder, presumably for checking the accuser named in ``lawsuit``.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None
|
||||
|
||||
# hard to implement
|
||||
def validate_topic():
    """Placeholder for a topic check; not implemented yet, returns ``None``."""
    return None
|
||||
|
||||
# also hard to implement
|
||||
def validate_intent():
    """Placeholder for an intent check; not implemented yet, returns ``None``."""
    return None
|
||||
|
||||
def validate_signature():
    """Placeholder for a signature check; not implemented yet, returns ``None``."""
    return None
|
||||
|
||||
def validate_date():
    """Placeholder for a date check; not implemented yet, returns ``None``."""
    return None
|
||||
|
||||
def validate(text):
    """Placeholder entry point for validating extracted ``text``.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None
|
Loading…
Reference in New Issue
Block a user