# diagram_ph/postComputation.py

import pandas as pd

class DataSegmentProcessor:
    """Segment, order and group the rows of a pressure/enthalpy DataFrame.

    The wrapped frame must provide a ``Pressure`` column. The ordering step
    additionally reads ``Enthalpy`` and the grouping step reads ``Enthalpy``,
    ``Order`` and ``Quality``. None of the methods modify the frames they are
    given; every result is a new DataFrame.
    """

    def __init__(self, dataframe):
        """Store ``dataframe`` as ``self.df``, the frame processed by :meth:`run`."""
        self.df = dataframe

    def round_and_deduplicate(self, df):
        """Return ``df`` rounded to two decimals with duplicate rows removed."""
        return df.round(2).drop_duplicates()

    def filter_and_deduplicate_all_segments(self, tol):
        """
        Partition ``self.df`` by closeness to the extreme pressure values.

        Args:
            tol (float): Half-width of the band kept around the maximum and
                around the minimum pressure (both bounds inclusive).

        Returns:
            Tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame]:
                - max_range: rows whose pressure lies within ``tol`` of the maximum.
                - min_range: rows whose pressure lies within ``tol`` of the minimum.
                - remaining_df: rows belonging to neither band.
            Each frame is rounded to two decimals and deduplicated. When the
            two bands overlap, a row can appear in both ``max_range`` and
            ``min_range``.
        """
        pressure = self.df['Pressure']
        max_pressure = pressure.max()
        min_pressure = pressure.min()

        near_max = pressure.between(max_pressure - tol, max_pressure + tol)
        near_min = pressure.between(min_pressure - tol, min_pressure + tol)

        max_range = self.round_and_deduplicate(self.df[near_max])
        min_range = self.round_and_deduplicate(self.df[near_min])
        # Boolean masks instead of two chained ``drop`` calls: chained drops
        # raise KeyError as soon as the two bands share a row.
        remaining_df = self.round_and_deduplicate(self.df[~(near_max | near_min)])

        return max_range, min_range, remaining_df

    def split_based_on_pressure_difference(self, final_circ, pressure_diff_threshold):
        """
        Split a DataFrame in two at the first large jump in sorted pressure.

        The rows are sorted by ``Pressure``; the split happens in front of the
        first row whose pressure exceeds the previous one by more than
        ``pressure_diff_threshold``.

        Args:
            final_circ (pandas.DataFrame): The input DataFrame to be split.
            pressure_diff_threshold (float): Minimum pressure jump between two
                consecutive sorted rows that counts as a gap.

        Returns:
            Tuple[pandas.DataFrame, pandas.DataFrame]: ``(lower_half, upper_half)``,
                both sorted by pressure. The row just above the gap is the
                first row of ``upper_half`` only.
                If the input is empty, both halves are empty DataFrames.
                If the input has a single row, or no jump exceeds the
                threshold, ``lower_half`` is an empty DataFrame and
                ``upper_half`` holds every row.
        """
        if len(final_circ) == 0:
            return pd.DataFrame(), pd.DataFrame()

        sorted_df = final_circ.sort_values(by='Pressure')
        if len(sorted_df) == 1:
            return pd.DataFrame(), sorted_df

        is_gap = (sorted_df['Pressure'].diff() > pressure_diff_threshold).to_numpy()
        if not is_gap.any():
            # No gap: always hand back a 2-tuple so callers can unpack it,
            # using the same convention as the single-row case.
            return pd.DataFrame(), sorted_df

        # Split by position in the *sorted* frame; slicing the unsorted input
        # by label does not separate low from high pressures.
        split_pos = int(is_gap.argmax())
        return sorted_df.iloc[:split_pos].copy(), sorted_df.iloc[split_pos:].copy()

    def sort_and_assign_orders(self, max_range, min_range, upper_half):
        """
        Sort the three segments and number their rows consecutively.

        ``upper_half`` comes first (ascending enthalpy, then descending
        pressure), followed by ``max_range`` (descending pressure, then
        descending enthalpy) and ``min_range`` (ascending enthalpy, then
        descending pressure). The ``Order`` column runs from 1 across the
        segments in that sequence.

        Args:
            max_range (pandas.DataFrame): Rows near the maximum pressure.
            min_range (pandas.DataFrame): Rows near the minimum pressure.
            upper_half (pandas.DataFrame): Remaining rows to place first; may
                be empty, in which case it is passed through untouched.

        Returns:
            pandas.DataFrame: Concatenation of the three numbered segments.
        """
        max_sorted = max_range.sort_values(by=['Pressure', 'Enthalpy'], ascending=False)
        min_sorted = min_range.sort_values(by=['Enthalpy', 'Pressure'], ascending=[True, False])

        if len(upper_half) != 0:
            upper_sorted = upper_half.sort_values(by=['Enthalpy', 'Pressure'], ascending=[True, False])
            upper_sorted['Order'] = range(1, len(upper_sorted) + 1)
        else:
            # An empty frame may have no columns at all, so leave it as is.
            upper_sorted = upper_half

        # Derive the offsets from lengths rather than from the last assigned
        # value, so empty segments neither shift the numbering nor raise.
        last_upper_order = len(upper_sorted)
        last_max_order = last_upper_order + len(max_sorted)
        max_sorted['Order'] = range(last_upper_order + 1, last_max_order + 1)
        min_sorted['Order'] = range(last_max_order + 1, last_max_order + len(min_sorted) + 1)

        return pd.concat([upper_sorted, max_sorted, min_sorted])

    def group_by_enthalpy_and_pressure(self, combined_df, pressure_diff_threshold=10000):
        """
        Tag rows that share an enthalpy value but span a wide pressure range.

        Every set of rows with identical ``Enthalpy`` whose pressure spread
        (max - min) exceeds ``pressure_diff_threshold`` receives its own
        ``Group`` id, numbered from 1 in ascending enthalpy order. One more id
        is then given to the last row in ``Order`` sequence and to the first
        row (in ``Order`` sequence) whose ``Quality`` is strictly greater than
        that last row's; if no such row exists only the last row is tagged.
        Rows without a group keep ``None``.

        Args:
            combined_df (pandas.DataFrame): Frame with ``Enthalpy``,
                ``Pressure``, ``Order`` and ``Quality`` columns.
            pressure_diff_threshold (float): Pressure spread above which rows
                of equal enthalpy form a group. NOTE(review): the original
                comment mentioned 100 kPa while the value used is 10000 --
                confirm the unit of ``Pressure``.

        Returns:
            pandas.DataFrame: A new frame sorted by ``Order`` with the
                additional ``Group`` column.
        """
        if combined_df.empty:
            untouched = combined_df.copy()
            untouched['Group'] = None
            return untouched

        # sort_values returns a new frame, so the caller's data stays untouched.
        ordered = combined_df.sort_values(by='Enthalpy')
        ordered['Group'] = None

        # Pressure spread per distinct enthalpy value (keys in ascending order).
        pressure_by_enthalpy = ordered.groupby('Enthalpy')['Pressure']
        spread = pressure_by_enthalpy.max() - pressure_by_enthalpy.min()
        wide_enthalpies = spread[spread > pressure_diff_threshold].index

        for group_id, enthalpy in enumerate(wide_enthalpies, start=1):
            ordered.loc[ordered['Enthalpy'] == enthalpy, 'Group'] = group_id
        closing_id = len(wide_enthalpies) + 1

        ordered = ordered.sort_values('Order')
        group_col = ordered.columns.get_loc('Group')

        # Positional writes stay correct even if index labels are duplicated.
        ordered.iloc[-1, group_col] = closing_id
        last_quality = ordered['Quality'].iloc[-1]
        higher_quality = (ordered['Quality'] > last_quality).to_numpy()
        if higher_quality.any():
            ordered.iloc[int(higher_quality.argmax()), group_col] = closing_id

        return ordered

    def run(self, pressure_diff_threshold=120e3, split_threshold=8000):
        """
        Run the full pipeline on ``self.df``.

        Args:
            pressure_diff_threshold (float): Despite its name, this value is
                used as the tolerance around the extreme pressures in
                :meth:`filter_and_deduplicate_all_segments`.
            split_threshold (float): Pressure jump used to split the remaining
                rows in :meth:`split_based_on_pressure_difference`.

        Returns:
            pandas.DataFrame: Ordered and grouped rows. The lower half of the
                split is not part of the result.
        """
        max_range_circ, min_range_circ, final_circ = self.filter_and_deduplicate_all_segments(
            pressure_diff_threshold
        )
        _, upper_half = self.split_based_on_pressure_difference(final_circ, split_threshold)
        combined_df = self.sort_and_assign_orders(max_range_circ, min_range_circ, upper_half)
        return self.group_by_enthalpy_and_pressure(combined_df)