Module phc.easy.ocr.block
Expand source code
import os
import pandas as pd
import toolz.curried as c
from funcy import iffy, isa
from phc.easy.auth import Auth
from phc.easy.frame import Frame
from phc.easy.ocr.document import Document
from phc.services import Files
from toolz import curry, get_in, partial, pipe
class Block:
    """Load and order the OCR text blocks stored for a document."""

    @staticmethod
    def get_data_frame(
        document_id: str, raw: bool = False, auth_args: Auth = Auth.shared()
    ):
        """Download a document's OCR block file and return it as a data frame.

        The document's "content" entries are searched for the first one whose
        format code is "ocr-text-file-id". The last path segment of that
        entry's attachment url is used as the id of the file to download.

        Parameters
        ----------
        document_id
            Id passed to ``Document.get`` to look up the document.
        raw
            When true, return the frame exactly as parsed from the file,
            without expanding the ``Geometry`` column or sorting the rows.
        auth_args
            Credentials forwarded to ``Auth`` and ``Document.get``. The
            default is evaluated once, when the class body is executed.

        Returns
        -------
        pandas.DataFrame
            The parsed frame (when ``raw`` is true or the file has no rows),
            otherwise the expanded frame indexed by "Id" and ordered by
            ``Block.sort``.

        Raises
        ------
        ValueError
            If the document has no "ocr-text-file-id" content entry, or that
            entry carries no attachment url.
        """
        auth = Auth(auth_args)
        document = Document.get(document_id, auth_args=auth_args)

        file_id = pipe(
            document.get("content", []),
            # The parameter is not named `c`, to avoid shadowing the
            # toolz.curried alias used throughout this pipeline.
            c.filter(
                lambda content: content.get("format", {}).get("code")
                == "ocr-text-file-id"
            ),
            # toolz `first` raises StopIteration on an empty sequence, which
            # would bypass the ValueError below; fall back to an empty entry.
            lambda matches: next(iter(matches), {}),
            c.get("attachment", default={}),
            # Without a default a missing "url" raises KeyError instead of
            # reaching the intended ValueError.
            c.get("url", default=None),
            iffy(isa(str), lambda url: url.split("/")[-1]),
        )

        if file_id is None:
            raise ValueError(
                f"No block file found for document: '{document_id}'"
            )

        files = Files(auth.session())
        filename = files.download(file_id, "/tmp/")

        try:
            # The block file is parsed as JSON lines: one record per line.
            frame = pd.read_json(filename, lines=True)
        finally:
            # Remove the downloaded file even when parsing fails.
            os.remove(filename)

        if raw or len(frame) == 0:
            return frame

        return Block.sort(
            # Replace the nested Geometry column with its flattened fields.
            frame.drop(["Geometry"], axis=1)
            .join(pd.json_normalize(frame.Geometry))
            .pipe(
                partial(
                    Frame.expand,
                    custom_columns=[
                        Frame.codeable_like_column_expander("Polygon")
                    ],
                )
            )
            .set_index("Id")
        )

    @staticmethod
    def sort(frame: pd.DataFrame):
        """Sort a Textract block frame by getting the proper order of the ids.

        Starts with the pages (ordered by "Page") and recursively gets the
        child ids for each descendant, so that the first three rows should
        (almost) always be PAGE -> LINE -> WORD, with all of the words of a
        line following that line.

        ``frame`` must be indexed by "Id". An id reached more than once keeps
        only its first occurrence in the result.
        """
        return (
            frame.loc[
                Block.recursive_get_child_ids(
                    frame,
                    frame.sort_values("Page")
                    .query("BlockType == 'PAGE'")
                    .index,
                )
            ]
            .reset_index()
            .drop_duplicates(subset=["Id"])
            .set_index("Id")
        )

    @staticmethod
    @curry
    def recursive_get_child_ids(frame: pd.DataFrame, ids: list):
        """Return each id followed, depth-first, by the ids beneath it.

        Children of an id are the "Ids" of the first entry in that row's
        ``Relationships`` value; a row where that path cannot be resolved
        contributes no children. The function is curried, so calling it with
        only ``frame`` yields a function that takes ``ids``.
        """
        return pipe(
            ids,
            c.mapcat(
                lambda an_id: [
                    an_id,
                    *pipe(
                        get_in([0, "Ids"], frame.loc[an_id].Relationships, []),
                        Block.recursive_get_child_ids(frame),
                    ),
                ]
            ),
            list,
        )
Classes
class Block
-
Expand source code
class Block:
    """Load and order the OCR text blocks stored for a document."""

    @staticmethod
    def get_data_frame(
        document_id: str, raw: bool = False, auth_args: Auth = Auth.shared()
    ):
        """Download a document's OCR block file and return it as a data frame.

        The first "content" entry whose format code is "ocr-text-file-id"
        supplies an attachment url; its last path segment is the id of the
        file to download. With ``raw`` true (or an empty file) the parsed
        frame is returned as is; otherwise ``Geometry`` is expanded and the
        rows are ordered by ``Block.sort``.

        Raises ``ValueError`` when no such content entry or url exists.
        """
        auth = Auth(auth_args)
        document = Document.get(document_id, auth_args=auth_args)

        file_id = pipe(
            document.get("content", []),
            c.filter(
                lambda content: content.get("format", {}).get("code")
                == "ocr-text-file-id"
            ),
            # toolz `first` raises StopIteration on an empty sequence, which
            # would bypass the ValueError below; fall back to an empty entry.
            lambda matches: next(iter(matches), {}),
            c.get("attachment", default={}),
            # Without a default a missing "url" raises KeyError instead.
            c.get("url", default=None),
            iffy(isa(str), lambda url: url.split("/")[-1]),
        )

        if file_id is None:
            raise ValueError(
                f"No block file found for document: '{document_id}'"
            )

        files = Files(auth.session())
        filename = files.download(file_id, "/tmp/")

        try:
            frame = pd.read_json(filename, lines=True)
        finally:
            # Remove the downloaded file even when parsing fails.
            os.remove(filename)

        if raw or len(frame) == 0:
            return frame

        return Block.sort(
            frame.drop(["Geometry"], axis=1)
            .join(pd.json_normalize(frame.Geometry))
            .pipe(
                partial(
                    Frame.expand,
                    custom_columns=[
                        Frame.codeable_like_column_expander("Polygon")
                    ],
                )
            )
            .set_index("Id")
        )

    @staticmethod
    def sort(frame: pd.DataFrame):
        """Sort a Textract block frame by getting the proper order of the ids.

        Starts with the pages and recursively gets the child ids for each
        descendant, so that the first three rows should (almost) always be
        PAGE -> LINE -> WORD, with all of the words of a line following it.
        """
        return (
            frame.loc[
                Block.recursive_get_child_ids(
                    frame,
                    frame.sort_values("Page")
                    .query("BlockType == 'PAGE'")
                    .index,
                )
            ]
            .reset_index()
            .drop_duplicates(subset=["Id"])
            .set_index("Id")
        )

    @staticmethod
    @curry
    def recursive_get_child_ids(frame: pd.DataFrame, ids: list):
        """Return each id followed, depth-first, by the ids beneath it.

        Children are the "Ids" of the first entry in the row's
        ``Relationships`` value; unresolvable paths yield no children.
        """
        return pipe(
            ids,
            c.mapcat(
                lambda an_id: [
                    an_id,
                    *pipe(
                        get_in([0, "Ids"], frame.loc[an_id].Relationships, []),
                        Block.recursive_get_child_ids(frame),
                    ),
                ]
            ),
            list,
        )
Static methods
def get_data_frame(document_id: str, raw: bool = False, auth_args: Auth = <phc.easy.auth.Auth object>)
-
Expand source code
@staticmethod
def get_data_frame(
    document_id: str, raw: bool = False, auth_args: Auth = Auth.shared()
):
    """Download a document's OCR block file and return it as a data frame.

    The first "content" entry whose format code is "ocr-text-file-id"
    supplies an attachment url; its last path segment is the id of the file
    to download. With ``raw`` true (or an empty file) the parsed frame is
    returned as is; otherwise ``Geometry`` is expanded and the rows are
    ordered by ``Block.sort``.

    Raises ``ValueError`` when no such content entry or url exists.
    """
    auth = Auth(auth_args)
    document = Document.get(document_id, auth_args=auth_args)

    file_id = pipe(
        document.get("content", []),
        c.filter(
            lambda content: content.get("format", {}).get("code")
            == "ocr-text-file-id"
        ),
        # toolz `first` raises StopIteration on an empty sequence, which
        # would bypass the ValueError below; fall back to an empty entry.
        lambda matches: next(iter(matches), {}),
        c.get("attachment", default={}),
        # Without a default a missing "url" raises KeyError instead.
        c.get("url", default=None),
        iffy(isa(str), lambda url: url.split("/")[-1]),
    )

    if file_id is None:
        raise ValueError(
            f"No block file found for document: '{document_id}'"
        )

    files = Files(auth.session())
    filename = files.download(file_id, "/tmp/")

    try:
        frame = pd.read_json(filename, lines=True)
    finally:
        # Remove the downloaded file even when parsing fails.
        os.remove(filename)

    if raw or len(frame) == 0:
        return frame

    return Block.sort(
        frame.drop(["Geometry"], axis=1)
        .join(pd.json_normalize(frame.Geometry))
        .pipe(
            partial(
                Frame.expand,
                custom_columns=[
                    Frame.codeable_like_column_expander("Polygon")
                ],
            )
        )
        .set_index("Id")
    )
def recursive_get_child_ids(frame: pandas.core.frame.DataFrame = '__no__default__', ids: list = '__no__default__')
def sort(frame: pandas.core.frame.DataFrame)
-
Sort a textract block frame by getting the proper order of the ids.
Starts with the pages and recursively gets the child ids for each descendant, so that the first three rows should (almost) always be PAGE -> LINE -> WORD, with all of the words of a line following that line.
Expand source code
@staticmethod
def sort(frame: pd.DataFrame):
    """Order a Textract block frame so that children follow their parents.

    The PAGE rows (taken in "Page" order) seed a recursive walk over child
    ids, so the first three rows should (almost) always be
    PAGE -> LINE -> WORD, with every word of a line following that line.
    """
    # Ids of the PAGE blocks, in page order, are the roots of the walk.
    page_ids = frame.sort_values("Page").query("BlockType == 'PAGE'").index
    ordered_ids = Block.recursive_get_child_ids(frame, page_ids)

    ordered = frame.loc[ordered_ids].reset_index()
    # An id reached more than once keeps only its first occurrence.
    deduplicated = ordered.drop_duplicates(subset=["Id"])
    return deduplicated.set_index("Id")