Module phc.util.csv_writer

Expand source code
import os
import re
import shutil

import pandas as pd


class CSVWriter:
    """Progressively write batches of pandas data frames to one CSV file.

    Later batches may introduce columns that earlier batches did not have.
    The header line is rewritten to include them, while rows that are already
    in the file are copied through byte-for-byte (never parsed), so the file
    is not loaded into memory. As a consequence, rows written before a column
    was introduced have fewer fields than the header.

    Two scratch files next to the target (``<filename>.bak`` and
    ``<filename>.batch.bak``) are used during a write and removed afterwards.
    """

    # Timestamp format applied to every batch so that rows written by
    # different calls are formatted consistently.
    _DATE_FORMAT = "%Y-%m-%dT%H:%M:%S%z"

    def __init__(self, filename: str):
        """Remember the target path and derive the scratch file paths."""
        self.filename = filename
        self.bak_filename = filename + ".bak"
        self.batch_filename = filename + ".batch.bak"

    def write(self, frame: pd.DataFrame) -> None:
        """Append a data frame to the CSV file, creating it if necessary.

        Existing columns keep their position; columns that appear for the
        first time in ``frame`` are appended to the end of the header.
        Columns present in the file but absent from ``frame`` are written as
        empty fields.

        Note: the column labels of ``frame`` are sanitized in place (tabs and
        newlines removed, labels converted to ``str``).

        Raises:
            OSError: if any of the files cannot be read or written.
        """

        # Strip tabs/newlines so the header always occupies exactly one line.
        # Labels are coerced to str because that is how they come back when
        # the header is re-read from disk on the next write.
        frame.columns = [
            re.sub(r"[\t\n]", "", str(c)) for c in frame.columns.tolist()
        ]

        if not os.path.exists(self.filename):
            frame.to_csv(
                self.filename, date_format=self._DATE_FORMAT, index=False
            )
            return

        self._copy_to_backup_file_without_header()

        original_columns = self._columns()
        known_columns = set(original_columns)
        new_columns = [c for c in frame.columns if c not in known_columns]

        # Existing columns first (same order as the file), new ones last.
        ordered_columns = [*original_columns, *new_columns]

        # reindex() adds the columns missing from this batch as empty values
        # without touching the dtypes of the columns that are present.
        frame.reindex(columns=ordered_columns).to_csv(
            self.batch_filename, date_format=self._DATE_FORMAT, index=False
        )

        self._finalize()

    def _columns(self):
        """Return the column names from the header of the existing file."""
        return pd.read_csv(self.filename, nrows=0).columns.tolist()

    def _copy_to_backup_file_without_header(self) -> None:
        """Stream every line except the header into the backup file."""
        with open(self.filename, "rb") as source, open(
            self.bak_filename, "wb"
        ) as backup:
            source.readline()  # skip the header line
            shutil.copyfileobj(source, backup)

    def _finalize(self) -> None:
        """Rebuild the target file and remove the scratch files.

        The result is the (possibly widened) header from the batch file,
        followed by the previously written rows, followed by the batch rows.
        """
        with open(self.batch_filename, "rb") as batch, open(
            self.filename, "wb"
        ) as target:
            target.write(batch.readline())  # new header
            with open(self.bak_filename, "rb") as backup:
                shutil.copyfileobj(backup, target)  # rows already on disk
            shutil.copyfileobj(batch, target)  # rows from this batch

        os.remove(self.batch_filename)
        os.remove(self.bak_filename)

Classes

class CSVWriter (filename: str)

Class for progressively writing batches of pandas data frames to a CSV file where additional columns may be added in subsequent writes

Expand source code
class CSVWriter:
    """Progressively write batches of pandas data frames to one CSV file.

    Later batches may introduce columns that earlier batches did not have.
    The header line is rewritten to include them, while rows that are already
    in the file are copied through byte-for-byte (never parsed), so the file
    is not loaded into memory. As a consequence, rows written before a column
    was introduced have fewer fields than the header.

    Two scratch files next to the target (``<filename>.bak`` and
    ``<filename>.batch.bak``) are used during a write and removed afterwards.
    """

    # Timestamp format applied to every batch so that rows written by
    # different calls are formatted consistently.
    _DATE_FORMAT = "%Y-%m-%dT%H:%M:%S%z"

    def __init__(self, filename: str):
        """Remember the target path and derive the scratch file paths."""
        self.filename = filename
        self.bak_filename = filename + ".bak"
        self.batch_filename = filename + ".batch.bak"

    def write(self, frame: pd.DataFrame) -> None:
        """Append a data frame to the CSV file, creating it if necessary.

        Existing columns keep their position; columns that appear for the
        first time in ``frame`` are appended to the end of the header.
        Columns present in the file but absent from ``frame`` are written as
        empty fields.

        Note: the column labels of ``frame`` are sanitized in place (tabs and
        newlines removed, labels converted to ``str``).

        Raises:
            OSError: if any of the files cannot be read or written.
        """

        # Strip tabs/newlines so the header always occupies exactly one line.
        # Labels are coerced to str because that is how they come back when
        # the header is re-read from disk on the next write.
        frame.columns = [
            re.sub(r"[\t\n]", "", str(c)) for c in frame.columns.tolist()
        ]

        if not os.path.exists(self.filename):
            frame.to_csv(
                self.filename, date_format=self._DATE_FORMAT, index=False
            )
            return

        self._copy_to_backup_file_without_header()

        original_columns = self._columns()
        known_columns = set(original_columns)
        new_columns = [c for c in frame.columns if c not in known_columns]

        # Existing columns first (same order as the file), new ones last.
        ordered_columns = [*original_columns, *new_columns]

        # reindex() adds the columns missing from this batch as empty values
        # without touching the dtypes of the columns that are present.
        frame.reindex(columns=ordered_columns).to_csv(
            self.batch_filename, date_format=self._DATE_FORMAT, index=False
        )

        self._finalize()

    def _columns(self):
        """Return the column names from the header of the existing file."""
        return pd.read_csv(self.filename, nrows=0).columns.tolist()

    def _copy_to_backup_file_without_header(self) -> None:
        """Stream every line except the header into the backup file."""
        with open(self.filename, "rb") as source, open(
            self.bak_filename, "wb"
        ) as backup:
            source.readline()  # skip the header line
            shutil.copyfileobj(source, backup)

    def _finalize(self) -> None:
        """Rebuild the target file and remove the scratch files.

        The result is the (possibly widened) header from the batch file,
        followed by the previously written rows, followed by the batch rows.
        """
        with open(self.batch_filename, "rb") as batch, open(
            self.filename, "wb"
        ) as target:
            target.write(batch.readline())  # new header
            with open(self.bak_filename, "rb") as backup:
                shutil.copyfileobj(backup, target)  # rows already on disk
            shutil.copyfileobj(batch, target)  # rows from this batch

        os.remove(self.batch_filename)
        os.remove(self.bak_filename)

Methods

def write(self, frame: pandas.core.frame.DataFrame)

Write a data frame to an existing CSV file without loading the entire file into memory

Expand source code
def write(self, frame: pd.DataFrame) -> None:
    """Append a data frame to the CSV file, creating it if necessary.

    If ``self.filename`` does not exist yet, ``frame`` is written to it
    directly. Otherwise the batch is written to ``self.batch_filename`` with
    the existing columns first (in file order) and any columns new to this
    batch appended after them; columns the batch lacks are left empty. The
    ``_copy_to_backup_file_without_header`` and ``_finalize`` helpers are
    called before and after that step to combine the batch with the rows
    already on disk, so the existing file is never loaded into memory here.

    Note: the column labels of ``frame`` are sanitized in place (tabs and
    newlines removed, labels converted to ``str``).
    """

    # Same timestamp format for the first write and every later batch, so the
    # file never mixes two datetime representations.
    date_format = "%Y-%m-%dT%H:%M:%S%z"

    # Strip tabs/newlines so the header always occupies exactly one line.
    # Labels are coerced to str because that is how they come back when the
    # header is re-read from disk on the next write.
    frame.columns = [
        re.sub(r"[\t\n]", "", str(c)) for c in frame.columns.tolist()
    ]

    if not os.path.exists(self.filename):
        frame.to_csv(self.filename, date_format=date_format, index=False)
        return

    self._copy_to_backup_file_without_header()

    original_columns = self._columns()
    known_columns = set(original_columns)
    new_columns = [c for c in frame.columns if c not in known_columns]

    # Existing columns first (same order as the file), new ones last.
    ordered_columns = [*original_columns, *new_columns]

    # reindex() adds the columns missing from this batch as empty values
    # without touching the dtypes of the columns that are present.
    frame.reindex(columns=ordered_columns).to_csv(
        self.batch_filename, date_format=date_format, index=False
    )

    self._finalize()