Basic court regex validator

This commit is contained in:
Vojtěch Káně 2021-01-23 03:18:21 +01:00
parent 957f5efb0c
commit d6ad798e96
2 changed files with 13 additions and 6 deletions

View File

@ -37,7 +37,7 @@
}; };
in in
pkgs.mkShell { pkgs.mkShell {
buildInputs = with pkgs.python38Packages; [ pkgs.python38 pip geneea_sdk pytesseract pdftotext pillow docx2txt flask ]; buildInputs = with pkgs.python38Packages; [ pkgs.python38 pip geneea_sdk pytesseract pdftotext pillow docx2txt flask regex ];
shellHook = '' shellHook = ''
read -p 'Insert Geneea API key:' -r -s geneea_api_key read -p 'Insert Geneea API key:' -r -s geneea_api_key
export geneea_api_key export geneea_api_key

View File

@ -3,6 +3,7 @@ import pdftotext
import docx2txt import docx2txt
import pytesseract import pytesseract
from PIL import Image from PIL import Image
import regex as re
def process_jpg(f): def process_jpg(f):
text = pytesseract.image_to_string(Image.open(f), lang="ces") text = pytesseract.image_to_string(Image.open(f), lang="ces")
@ -16,7 +17,7 @@ def process_pdf(f):
def process_docx(f): def process_docx(f):
# TODO weird output with many spaces # TODO weird output with many spaces
return docx2txt.process(f) return docx2txt.process(f)
def process_file(f): def process_file(f):
# TODO proper file format distinguishing, not only by suffix? # TODO proper file format distinguishing, not only by suffix?
@ -27,10 +28,11 @@ def process_file(f):
elif ext == '.pdf': elif ext == '.pdf':
return process_pdf(f) return process_pdf(f)
elif ext == '.docx': elif ext == '.docx':
return process_docx(f) return process_docx(f)
# Czech court designation: the court type in any grammatical case, the word
# "soud" (also inflected), an optional preposition "v"/"ve", and the next
# word (normally the seat of the court).
# Capture groups (numbering kept stable for callers):
#   1 - court type, including the optional "správní" part
#   2 - the " správní..." part of "nejvyšší správní soud", if present
#   3 - the preposition " v" / " ve", if present
#   4 - the word following the court name
court_pat = re.compile(
    # "krajsk[ýé]" covers krajský/krajským as well as krajského/krajskému;
    # the lazy \S*? lets "nejvyšší" carry a case ending before "správní",
    # and the "správní" group is optional so the plain "Nejvyšší soud"
    # is recognised too.
    r"(okresní|krajsk[ýé]|vrchní|nejvyšší\S*?(\s+správní\S*)?|ústavní)\S*"
    r"\s+soud\S*(\s+ve?)?\s+(\S+)",
    flags=re.IGNORECASE,
)


def validate_court(lawsuit: str):
    """Find the court designation in the text of a lawsuit.

    The text is scanned for the first occurrence of ``court_pat``; the
    designation does not have to start at the very first character, so
    leading whitespace or a header (e.g. from OCR) does not prevent a match.

    Args:
        lawsuit: Plain text of the lawsuit.

    Returns:
        The match object for the first court designation found (see the
        group layout documented at ``court_pat``), or ``None`` when the
        text names no court.
    """
    return court_pat.search(lawsuit)
def validate_accuser(lawsuit): def validate_accuser(lawsuit):
pass pass
@ -50,4 +52,9 @@ def validate_date():
pass pass
def validate(text): def validate(text):
pass pass
# debug: feed the text of a lawsuit on stdin and print the court match
# (the match object, or None when no court is recognised)
if __name__ == "__main__":
    import sys

    lawsuit_text = sys.stdin.read()
    print(validate_court(lawsuit_text))