# Source code for utils.classes.pandas_dataframe

import pandas as pd


[docs]
class PandasDataframe:
    """
    Class responsible for the standardization and manipulation of Pandas DataFrames.

    Attributes:
        path (str):
            Absolute path to the CSV file.

        df (pandas.DataFrame):
            Data stored in the dataframe.

        list (list, optional):
            List of dataframes used for concatenation.

        dict (dict, optional):
            Dictionary used to create a dataframe.
    """

    def __init__(self, path, df, **kwargs):
        self.path = path
        self.df = df
        self.list = kwargs.get('list') or None
        self.dict = kwargs.get('dict') or None


[docs]
    def csv_to_df(self):
        """
        Reads a CSV file from the specified path and loads it into the dataframe.
        """
        self.df = pd.read_csv(self.path, sep=';', encoding='utf-8', index_col=None)

    

[docs]
    def df_to_csv(self, path):
        """
        Exports the current dataframe to a CSV file.

        Args:
            path (str):
                Destination path for the CSV file.
        """
        self.df.to_csv(path, sep=';', index=False)



[docs]
    def list_to_df(self):
        """
        Concatenates a list of dataframes into a single dataframe.
        """
        self.df = pd.concat(self.list, ignore_index=True)

    

[docs]
    def dict_to_df(self):
        """
        Converts the stored dictionary into a dataframe.
        """
        self.df = pd.DataFrame(self.dict)



[docs]
    def drop_column(self, column, direction):
        """
        Drops rows or columns from the dataframe.

        Args:
            column (str or list):
                Column name(s) or row label(s) to drop.

            direction (int):
                Axis to drop from (0 for rows, 1 for columns).
        """
        self.df = self.df.drop(column, axis=direction)



[docs]
    def query_date(self, start_date, end_date, column_name):
        """
        Filters rows between two dates based on a specified date column.

        Args:
            start_date (str or datetime):
                Start date for filtering.

            end_date (str or datetime):
                End date for filtering.

            column_name (str):
                Name of the date column.
        """
        self.df[column_name] = pd.to_datetime(self.df[column_name])

        self.df = self.df.loc[
            (self.df[column_name] >= start_date) &
            (self.df[column_name] <= end_date)
        ]



[docs]
    def query_element_in(self, column, collection):
        """
        Filters rows where column values are within a given collection.

        Args:
            column (str):
                Column name to filter.

            collection (list or set):
                Collection of values to match.
        """
        self.df = self.df[self.df[column].isin(collection)]

    

[docs]
    def query_date_and_element(self, start_date, end_date, date_column_name, investment_cnpj, investment_column_cnpj):
        """
        Filters rows based on both a date range and a specific element.

        Args:
            start_date (str or datetime):
                Start date for filtering.

            end_date (str or datetime):
                End date for filtering.

            date_column_name (str):
                Name of the date column.

            investment_cnpj (str):
                Value to filter in the investment column.

            investment_column_cnpj (str):
                Column name containing the investment identifier.

        Returns:
            pandas.DataFrame:
                Filtered dataframe.
        """
        new_df_reference = self.df

        new_df_reference = new_df_reference[
            new_df_reference[investment_column_cnpj].isin([investment_cnpj])
        ]

        new_df_reference[date_column_name] = pd.to_datetime(new_df_reference[date_column_name])

        new_df_reference = new_df_reference.loc[
            (new_df_reference[date_column_name] >= start_date) &
            (new_df_reference[date_column_name] <= end_date)
        ]

        return new_df_reference



[docs]
    def reset_index(self):
        """
        Resets the dataframe index.
        """
        self.df = self.df.reset_index()

    

[docs]
    def get_column_in_list(self, column):
        """
        Returns a dataframe column as a Python list.

        Args:
            column (str):
                Column name.

        Returns:
            list:
                Column values as a list.
        """
        return self.df[column].tolist()

    

[docs]
    def sort_elements_list(self, sort_list):
        """
        Sorts the dataframe by the specified columns.

        Args:
            sort_list (list):
                List of column names to sort by.
        """
        self.df = self.df.sort_values(by=sort_list)

    

[docs]
    def order_columns(self, order_list):
        """
        Reorders the dataframe columns.

        Args:
            order_list (list):
                Desired column order.
        """
        self.df = self.df[order_list]



[docs]
    def group_element(self, group_element):
        """
        Groups the dataframe by the specified column(s).

        Args:
            group_element (str or list):
                Column(s) to group by.
        """
        self.df = self.df.groupby(group_element)



[docs]
    def find_row_date_greater_or_equals_than_indicated(self, date_str) -> tuple[bool, int]:
        """
        Finds the first row where the date is greater than or equal to the given date.

        Args:
            date_str (str or datetime):
                Reference date.

        Returns:
            tuple:
                (True, index) if found, otherwise (False, 0).
        """
        column_date = pd.to_datetime(self.df['DT_COMPTC'])
        mask = column_date >= date_str

        if mask.any():
            idx = mask.idxmax()
            return [True, idx]
        else:
            return [False, 0]



[docs]
    def find_row_data(self, row):
        """
        Retrieves a row from the dataframe by index.

        Args:
            row (int):
                Row index.

        Returns:
            pandas.Series:
                Row data.
        """
        return self.df.iloc[row]




# Names exported by ``from <module> import *``: only the wrapper class.
__all__ = ["PandasDataframe"]