From d6ad798e96d93f348ab7a0f5987cda4847e7a83c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20K=C3=A1n=C4=9B?= Date: Sat, 23 Jan 2021 03:18:21 +0100 Subject: [PATCH] Basic court regex validator --- [Reviewer note: court_pat accepts "nejvyšší" with or without a following "správní", and tolerates inflected endings on both words; its groups are 1 = court type, 2 = optional "správní" part, 3 = optional preposition v/ve, 4 = the next word. validate_court() uses match(), so the court name must be at the very start of the text.] flake.nix | 2 +- validator/validator.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/flake.nix b/flake.nix index 550e368..88903aa 100644 --- a/flake.nix +++ b/flake.nix @@ -37,7 +37,7 @@ }; in pkgs.mkShell { - buildInputs = with pkgs.python38Packages; [ pkgs.python38 pip geneea_sdk pytesseract pdftotext pillow docx2txt flask ]; + buildInputs = with pkgs.python38Packages; [ pkgs.python38 pip geneea_sdk pytesseract pdftotext pillow docx2txt flask regex ]; shellHook = '' read -p 'Insert Geneea API key:' -r -s geneea_api_key export geneea_api_key diff --git a/validator/validator.py b/validator/validator.py index 0047789..4a4f95f 100644 --- a/validator/validator.py +++ b/validator/validator.py @@ -3,6 +3,7 @@ import pdftotext import docx2txt import pytesseract from PIL import Image +import regex as re def process_jpg(f): text = pytesseract.image_to_string(Image.open(f), lang="ces") @@ -16,7 +17,7 @@ def process_pdf(f): def process_docx(f): # TODO weird output with many spaces return docx2txt.process(f) - + def process_file(f): # TODO proper file format distinguishing, not only by suffix? @@ -27,10 +28,11 @@ def process_file(f): elif ext == '.pdf': return process_pdf(f) elif ext == '.docx': - return process_docx(f) + return process_docx(f) -def validate_court(lawsuit): - pass +court_pat = re.compile(r"(okresní|krajský|vrchní|nejvyšší\S*(\s+správní\S*)?|ústavní)\S*\s+soud\S*(\s+ve?)?\s+(\S+)", flags=re.IGNORECASE) +def validate_court(lawsuit: str): + return court_pat.match(lawsuit) def validate_accuser(lawsuit): pass @@ -50,4 +52,9 @@ def validate_date(): pass def validate(text): - pass \ No newline at end of file + pass + +# debug +if __name__ == "__main__": + import sys + print(validate_court(str(sys.stdin.read())))