Module phc.easy.frame

Expand source code
import re
from typing import Callable, List, Optional, Tuple

import pandas as pd
from toolz import curry

from phc.easy.codeable import Codeable

# Matches a trailing timezone offset at the end of a timestamp string,
# e.g. "-05:00", "+0530", or "+00:00Z" — stripped so the remainder can be
# re-parsed as a "local" datetime (see Frame.expand).
TZ_REGEX = re.compile(r"[-+]\d{2}:?\d{2}Z?$")

# Column names that conventionally hold FHIR codeable / nested dict data;
# Frame.expand flattens each of these via Codeable.expand_column.
CODE_COLUMNS = [
    "meta",
    "identifier",
    "extension",
    "telecom",
    "valueCodeableConcept",
    "code",
    "valueQuantity",
    "category",
    "target",
]
# Column names that conventionally hold date/datetime strings;
# Frame.expand parses these (adding ".local" and ".tz" companion columns).
DATE_COLUMNS = [
    "dob",
    "birth_date",
    "birthDate",
    "deceasedDateTime",
    "effectiveDateTime",
    "meta.tag_lastUpdated",
]


def column_to_frame(frame: pd.DataFrame, column_name: str, expand_func):
    """Apply `expand_func` to `frame[column_name]` if that column exists.

    Returns the expanded data frame, or an empty data frame when the
    column is absent from `frame`.
    """
    if column_name not in frame.columns:
        return pd.DataFrame([])

    return expand_func(frame[column_name])


class Frame:
    @staticmethod
    @curry
    def _find_index_of_similar(columns: List[str], column_name: str):
        """Find sort order for `column_name` by the original frame columns.

        Returns the index of the last entry of `columns` that appears as a
        substring of `column_name`, or `len(columns)` when none matches
        (so unmatched columns sort to the end).
        """
        MAX_INDEX = len(columns)

        return next(
            filter(
                lambda pair: pair[1] in column_name,
                # Start from reverse end since later columns might be longer
                reversed(list(enumerate(columns))),
            ),
            (MAX_INDEX, None),
        )[0]

    @staticmethod
    def codeable_like_column_expander(column_name: str):
        """Codeable expansion with prefix for passing to Frame.expand#custom_columns

        Returns a `(column_name, expander)` tuple where the expander flattens
        the column via `Codeable.expand_column` and prefixes the resulting
        columns with `"{column_name}."`.
        """

        def _expander(column):
            return Codeable.expand_column(column).add_prefix(f"{column_name}.")

        return (column_name, _expander)

    @staticmethod
    def expand(
        frame: pd.DataFrame,
        code_columns: Optional[List[str]] = None,
        date_columns: Optional[List[str]] = None,
        custom_columns: Optional[
            List[Tuple[str, Callable[[pd.Series], pd.DataFrame]]]
        ] = None,
    ):
        """Expand a data frame with FHIR codes, nested JSON structures, etc into a full,
        tabular data frame that can much more easily be wrangled

        Parameters
        ----------
        frame : pd.DataFrame
            The data frame to expand

        code_columns : List[str], optional
            The list of column names that contain code-like data (e.g. FHIR dictionaries)

        date_columns : List[str], optional
            The list of column names that contain dates (may not be able to
            parse every value but will attempt to)

        custom_columns : List[Tuple[str, Callable[[pd.Series], pd.DataFrame]]], optional
            A list of tuples with the column name and a function that expands a
            column to a data frame. This will get merged index-wise into the
            combined frame

        Returns
        -------
        pd.DataFrame
            The frame with code/custom columns flattened, date columns parsed
            (plus ".local" and ".tz" companion columns), duplicate columns
            dropped, and columns sorted by original order where possible.
        """
        # None sentinels instead of mutable list defaults; normalize here
        code_columns = code_columns or []
        date_columns = date_columns or []
        custom_columns = custom_columns or []

        all_code_columns = [*CODE_COLUMNS, *code_columns]
        all_date_columns = [*DATE_COLUMNS, *date_columns]

        codeable_column_names = [
            key for key in all_code_columns if key in frame.columns
        ]

        custom_names = [
            key for key, _func in custom_columns if key in frame.columns
        ]

        code_frames = [
            (Codeable.expand_column(frame[col_name]).add_prefix(f"{col_name}."))
            for col_name in codeable_column_names
        ]

        # Replace code/custom columns with their expanded frames, merged
        # index-wise alongside the untouched columns
        columns = [
            frame.drop([*codeable_column_names, *custom_names], axis=1),
            *[
                (column_to_frame(frame, key, func))
                for key, func in custom_columns
            ],
            *code_frames,
        ]

        combined = pd.concat(columns, axis=1)

        date_column_names = list(
            filter(lambda k: k in combined.columns, all_date_columns)
        )

        # Mutate data frame to parse date columns
        for column_key in date_column_names:
            local_key = f"{column_key}.local"
            tz_key = f"{column_key}.tz"

            # Cleverness: Use regex to remove TZ and parse as utc=True to
            # produce local datetime. The column name will have ".local" as
            # suffix so it'll be clear what's happening.
            # (regex=True is required for a compiled pattern since pandas
            # 2.0, where str.replace defaults to regex=False.)
            without_tz = combined[column_key].str.replace(
                TZ_REGEX, "", regex=True
            )

            try:
                utc = pd.to_datetime(combined[column_key], utc=True)
                localized = pd.to_datetime(without_tz, utc=True)
            except pd.errors.OutOfBoundsDatetime as ex:
                # Retry with coercion so out-of-range values become NaT
                # instead of raising
                print(
                    "[WARNING]: OutOfBoundsDatetime encountered. Casting to NaT.",
                    ex,
                )
                utc = pd.to_datetime(
                    combined[column_key], utc=True, errors="coerce"
                )
                localized = pd.to_datetime(
                    without_tz, utc=True, errors="coerce"
                )

            # Offset between local and UTC timestamps, in hours
            combined[tz_key] = (localized - utc).dt.total_seconds() / 3600
            combined[local_key] = localized

        # Drop duplicate columns (nicety for same transform applied to cache)
        # Sort columns by original order (where possible)
        return combined.loc[:, ~combined.columns.duplicated()].reindex(
            sorted(
                [
                    c
                    for c in combined.columns.unique()
                    if c not in date_column_names
                ],
                key=Frame._find_index_of_similar(frame.columns),
            ),
            axis="columns",
        )

Functions

def column_to_frame(frame: pandas.core.frame.DataFrame, column_name: str, expand_func)

Converts a column (if exists) to a data frame with multiple columns

Expand source code
def column_to_frame(frame: pd.DataFrame, column_name: str, expand_func):
    "Converts a column (if exists) to a data frame with multiple columns"
    if column_name in frame.columns:
        return expand_func(frame[column_name])

    return pd.DataFrame([])

Classes

class Frame
Expand source code
class Frame:
    @staticmethod
    @curry
    def _find_index_of_similar(columns: List[str], column_name: str):
        "Find sort order by original frame column names"
        MAX_INDEX = len(columns)

        return next(
            filter(
                lambda pair: pair[1] in column_name,
                # Start from reverse end since later columns might be longer
                reversed(list(enumerate(columns))),
            ),
            (MAX_INDEX, None),
        )[0]

    @staticmethod
    def codeable_like_column_expander(column_name: str):
        """Codeable expansion with prefix for passing to Frame.expand#custom_columns"""

        def _expander(column):
            return Codeable.expand_column(column).add_prefix(f"{column_name}.")

        return (column_name, _expander)

    @staticmethod
    def expand(
        frame: pd.DataFrame,
        code_columns: List[str] = [],
        date_columns: List[str] = [],
        custom_columns: List[
            Tuple[str, Callable[[pd.Series], pd.DataFrame]]
        ] = [],
    ):
        """Expand a data frame with FHIR codes, nested JSON structures, etc into a full,
        tabular data frame that can much more easily be wrangled

        Attributes
        ----------
        frame : pd.DataFrame
            The data frame to expand

        code_columns : List[str]
            The list of column names that contain code-like data (e.g. FHIR dictionaries)

        date_columns : List[str]
            The list of column names that contain dates (may not able to parse but might)

        custom_columns : List[Tuple[str, Callable[[pd.Series], pd.DataFrame]]]
            A list of tuples with the column name and a function that expands a
            column to a data frame. This will get merged index-wise into the
            combined frame

        """
        all_code_columns = [*CODE_COLUMNS, *code_columns]
        all_date_columns = [*DATE_COLUMNS, *date_columns]

        codeable_column_names = [
            key for key in all_code_columns if key in frame.columns
        ]

        custom_names = [
            key for key, _func in custom_columns if key in frame.columns
        ]

        code_frames = [
            (Codeable.expand_column(frame[col_name]).add_prefix(f"{col_name}."))
            for col_name in codeable_column_names
        ]

        columns = [
            frame.drop([*codeable_column_names, *custom_names], axis=1),
            *[
                (column_to_frame(frame, key, func))
                for key, func in custom_columns
            ],
            *code_frames,
        ]

        combined = pd.concat(columns, axis=1)

        date_column_names = list(
            filter(lambda k: k in combined.columns, all_date_columns)
        )

        # Mutate data frame to parse date columns
        for column_key in date_column_names:
            local_key = f"{column_key}.local"
            tz_key = f"{column_key}.tz"

            try:
                utc = pd.to_datetime(combined[column_key], utc=True)

                # Cleverness: Use regex to remove TZ and parse as utc=True to
                # produce local datetime. The column name will have ".local" as
                # suffix so it'll be clear what's happening.
                localized = pd.to_datetime(
                    combined[column_key].str.replace(TZ_REGEX, ""), utc=True
                )
            except pd.errors.OutOfBoundsDatetime as ex:
                print(
                    "[WARNING]: OutOfBoundsDatetime encountered. Casting to NaT.",
                    ex,
                )
                utc = pd.to_datetime(
                    combined[column_key], utc=True, errors="coerce"
                )
                localized = pd.to_datetime(
                    combined[column_key].str.replace(TZ_REGEX, ""),
                    utc=True,
                    errors="coerce",
                )

            combined[tz_key] = (localized - utc).dt.total_seconds() / 3600
            combined[local_key] = localized

        # Drop duplicate columns (nicety for same transform applied to cache)
        # Sort columns by original order (where possible)
        return combined.loc[:, ~combined.columns.duplicated()].reindex(
            sorted(
                [
                    c
                    for c in combined.columns.unique()
                    if c not in date_column_names
                ],
                key=Frame._find_index_of_similar(frame.columns),
            ),
            axis="columns",
        )

Static methods

def codeable_like_column_expander(column_name: str)

Codeable expansion with prefix for passing to Frame.expand#custom_columns

Expand source code
@staticmethod
def codeable_like_column_expander(column_name: str):
    """Codeable expansion with prefix for passing to Frame.expand#custom_columns"""

    def _expander(column):
        return Codeable.expand_column(column).add_prefix(f"{column_name}.")

    return (column_name, _expander)
def expand(frame: pandas.core.frame.DataFrame, code_columns: List[str] = [], date_columns: List[str] = [], custom_columns: List[Tuple[str, Callable[[pandas.core.series.Series], pandas.core.frame.DataFrame]]] = [])

Expand a data frame with FHIR codes, nested JSON structures, etc into a full, tabular data frame that can much more easily be wrangled

Attributes

frame : pd.DataFrame
The data frame to expand
code_columns : List[str]
The list of column names that contain code-like data (e.g. FHIR dictionaries)
date_columns : List[str]
The list of column names that contain dates (may not be able to parse every value but will attempt to)
custom_columns : List[Tuple[str, Callable[[pd.Series], pd.DataFrame]]]
A list of tuples with the column name and a function that expands a column to a data frame. This will get merged index-wise into the combined frame
Expand source code
@staticmethod
def expand(
    frame: pd.DataFrame,
    code_columns: List[str] = [],
    date_columns: List[str] = [],
    custom_columns: List[
        Tuple[str, Callable[[pd.Series], pd.DataFrame]]
    ] = [],
):
    """Expand a data frame with FHIR codes, nested JSON structures, etc into a full,
    tabular data frame that can much more easily be wrangled

    Attributes
    ----------
    frame : pd.DataFrame
        The data frame to expand

    code_columns : List[str]
        The list of column names that contain code-like data (e.g. FHIR dictionaries)

    date_columns : List[str]
        The list of column names that contain dates (may not able to parse but might)

    custom_columns : List[Tuple[str, Callable[[pd.Series], pd.DataFrame]]]
        A list of tuples with the column name and a function that expands a
        column to a data frame. This will get merged index-wise into the
        combined frame

    """
    all_code_columns = [*CODE_COLUMNS, *code_columns]
    all_date_columns = [*DATE_COLUMNS, *date_columns]

    codeable_column_names = [
        key for key in all_code_columns if key in frame.columns
    ]

    custom_names = [
        key for key, _func in custom_columns if key in frame.columns
    ]

    code_frames = [
        (Codeable.expand_column(frame[col_name]).add_prefix(f"{col_name}."))
        for col_name in codeable_column_names
    ]

    columns = [
        frame.drop([*codeable_column_names, *custom_names], axis=1),
        *[
            (column_to_frame(frame, key, func))
            for key, func in custom_columns
        ],
        *code_frames,
    ]

    combined = pd.concat(columns, axis=1)

    date_column_names = list(
        filter(lambda k: k in combined.columns, all_date_columns)
    )

    # Mutate data frame to parse date columns
    for column_key in date_column_names:
        local_key = f"{column_key}.local"
        tz_key = f"{column_key}.tz"

        try:
            utc = pd.to_datetime(combined[column_key], utc=True)

            # Cleverness: Use regex to remove TZ and parse as utc=True to
            # produce local datetime. The column name will have ".local" as
            # suffix so it'll be clear what's happening.
            localized = pd.to_datetime(
                combined[column_key].str.replace(TZ_REGEX, ""), utc=True
            )
        except pd.errors.OutOfBoundsDatetime as ex:
            print(
                "[WARNING]: OutOfBoundsDatetime encountered. Casting to NaT.",
                ex,
            )
            utc = pd.to_datetime(
                combined[column_key], utc=True, errors="coerce"
            )
            localized = pd.to_datetime(
                combined[column_key].str.replace(TZ_REGEX, ""),
                utc=True,
                errors="coerce",
            )

        combined[tz_key] = (localized - utc).dt.total_seconds() / 3600
        combined[local_key] = localized

    # Drop duplicate columns (nicety for same transform applied to cache)
    # Sort columns by original order (where possible)
    return combined.loc[:, ~combined.columns.duplicated()].reindex(
        sorted(
            [
                c
                for c in combined.columns.unique()
                if c not in date_column_names
            ],
            key=Frame._find_index_of_similar(frame.columns),
        ),
        axis="columns",
    )