123 lines
6.4 KiB
Python
123 lines
6.4 KiB
Python
import pandas as pd
|
|
|
|
class DataSegmentProcessor:
    """Split, order and group the rows of a DataFrame by pressure and enthalpy.

    The wrapped DataFrame must provide a ``'Pressure'`` column; the ordering
    step additionally reads ``'Enthalpy'`` and the grouping step reads
    ``'Enthalpy'``, ``'Order'`` and ``'Quality'``.
    """

    def __init__(self, dataframe):
        """Store the DataFrame that ``run`` and the filtering step operate on."""
        self.df = dataframe

    def round_and_deduplicate(self, df):
        """Return ``df`` rounded to 2 decimals with duplicate rows removed."""
        return df.round(2).drop_duplicates()

    def filter_and_deduplicate_all_segments(self, tol):
        """
        Partition the stored DataFrame around its extreme pressure values.

        Args:
            tol (float): Half-width of the band kept around the maximum and
                around the minimum pressure (bounds are inclusive).

        Returns:
            Tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame]:
                - max_range: rows whose pressure lies within ``tol`` of the
                  maximum pressure.
                - min_range: rows whose pressure lies within ``tol`` of the
                  minimum pressure.
                - remaining_df: rows that belong to neither band.
            Each frame is rounded to 2 decimals and de-duplicated. When the
            two bands overlap, a row can appear in both max_range and
            min_range.
        """
        pressure = self.df['Pressure']
        max_pressure = pressure.max()
        min_pressure = pressure.min()

        near_max = (pressure >= max_pressure - tol) & (pressure <= max_pressure + tol)
        near_min = (pressure >= min_pressure - tol) & (pressure <= min_pressure + tol)

        # A boolean mask is used instead of chaining two ``drop`` calls: when
        # a row sits in both bands, the second ``drop`` would raise KeyError
        # because its labels were already removed by the first one.
        max_range = self.df[near_max]
        min_range = self.df[near_min]
        remaining_df = self.df[~(near_max | near_min)]

        return (
            self.round_and_deduplicate(max_range),
            self.round_and_deduplicate(min_range),
            self.round_and_deduplicate(remaining_df),
        )

    def split_based_on_pressure_difference(self, final_circ, pressure_diff_threshold):
        """
        Split a DataFrame in two at the first large jump in sorted pressure.

        The rows are sorted by ``'Pressure'``; the first row whose pressure
        exceeds the previous one by more than ``pressure_diff_threshold``
        provides the split label.

        Args:
            final_circ (pandas.DataFrame): The input DataFrame to be split.
            pressure_diff_threshold (float): Minimum pressure jump between two
                consecutive sorted rows that triggers the split.

        Returns:
            Tuple[pandas.DataFrame, pandas.DataFrame]: ``(lower_half, upper_half)``.
                - Empty input: both halves are empty DataFrames.
                - Single row: the lower half is empty, the upper half is the
                  input.
                - No jump above the threshold: both halves are empty
                  DataFrames.
        """
        if len(final_circ) == 0:
            return pd.DataFrame(), pd.DataFrame()

        sorted_df = final_circ.sort_values(by='Pressure')
        if len(sorted_df) == 1:
            return pd.DataFrame(), sorted_df

        pressure_diff = sorted_df['Pressure'].diff()
        split_index = pressure_diff[pressure_diff > pressure_diff_threshold].first_valid_index()

        if split_index is None:
            # Always hand back a 2-tuple so callers can unpack the result.
            return pd.DataFrame(), pd.DataFrame()

        # NOTE(review): the slices are taken on ``final_circ`` in its original
        # row order (not on the pressure-sorted frame) and ``.loc`` slicing is
        # inclusive on both ends, so the split row is part of both halves.
        # Kept as-is for compatibility; confirm this is the intended split.
        lower_half = final_circ.loc[:split_index]
        upper_half = final_circ.loc[split_index:]
        return lower_half, upper_half

    def sort_and_assign_orders(self, max_range, min_range, upper_half):
        """
        Sort the three segments and number their rows with a running 'Order'.

        Sorting rules:
            - upper_half: 'Enthalpy' ascending, then 'Pressure' descending.
            - max_range: 'Pressure' descending, then 'Enthalpy' descending.
            - min_range: 'Enthalpy' ascending, then 'Pressure' descending.

        'Order' is assigned consecutively over upper_half, max_range and
        min_range, in that sequence. With a non-empty upper_half numbering
        starts at 1; with an empty upper_half it starts at 2 (long-standing
        behaviour, retained so existing results do not shift).

        The inputs are left untouched; sorted copies are used instead.

        Args:
            max_range (pandas.DataFrame): Rows near the maximum pressure.
            min_range (pandas.DataFrame): Rows near the minimum pressure.
            upper_half (pandas.DataFrame): Rows placed before max_range; may
                be empty.

        Returns:
            pandas.DataFrame: Concatenation of the ordered upper_half,
            max_range and min_range, each carrying an 'Order' column.
        """
        max_sorted = max_range.sort_values(by=['Pressure', 'Enthalpy'], ascending=False)
        min_sorted = min_range.sort_values(by=['Enthalpy', 'Pressure'], ascending=[True, False])

        if len(upper_half) != 0:
            upper_sorted = upper_half.sort_values(
                by=['Enthalpy', 'Pressure'], ascending=[True, False]
            )
            upper_sorted['Order'] = range(1, len(upper_sorted) + 1)
            last_upper_order = len(upper_sorted)
        else:
            upper_sorted = upper_half
            last_upper_order = 1

        # Derive the running counter from lengths rather than from
        # ``['Order'].iloc[-1]`` so an empty segment cannot raise IndexError.
        max_sorted['Order'] = range(last_upper_order + 1, last_upper_order + len(max_sorted) + 1)
        last_max_order = last_upper_order + len(max_sorted)
        min_sorted['Order'] = range(last_max_order + 1, last_max_order + len(min_sorted) + 1)

        segments = [upper_sorted, max_sorted, min_sorted]
        # Skip empty frames so they cannot influence the resulting dtypes.
        non_empty = [segment for segment in segments if not segment.empty]
        return pd.concat(non_empty if non_empty else segments)

    def group_by_enthalpy_and_pressure(self, combined_df, pressure_gap_threshold=10000):
        """
        Tag rows that share an enthalpy value but are far apart in pressure.

        Rows with the same 'Enthalpy' whose pressure spread (max - min)
        exceeds ``pressure_gap_threshold`` receive a common integer in a new
        'Group' column (ids start at 1, one id per enthalpy value). The frame
        is then sorted by 'Order', and one further id is written to the last
        row and to the first row whose 'Quality' is strictly greater than the
        last row's 'Quality', overwriting any id that row already had. If no
        row has a greater 'Quality', only the last row is tagged.

        The input frame is not modified.

        Args:
            combined_df (pandas.DataFrame): Frame with 'Enthalpy', 'Pressure',
                'Order' and 'Quality' columns.
            pressure_gap_threshold (float): Pressure spread above which rows
                of equal enthalpy are grouped. Expressed in the units of the
                'Pressure' column.
                NOTE(review): confirm the intended magnitude of the default.

        Returns:
            pandas.DataFrame: Copy of ``combined_df`` sorted by 'Order' with
            the 'Group' column added (``None`` for ungrouped rows).
        """
        if combined_df.empty:
            return combined_df.assign(Group=None)

        ph_sorted = combined_df.assign(Group=None).sort_values(by='Enthalpy')

        group_id = 1
        for _, members in ph_sorted.groupby('Enthalpy'):
            pressure_spread = members['Pressure'].max() - members['Pressure'].min()
            if pressure_spread > pressure_gap_threshold:
                ph_sorted.loc[members.index, 'Group'] = group_id
                group_id += 1

        ph_sorted = ph_sorted.sort_values('Order')

        # Positional access keeps this step well-defined even if index labels
        # repeat (possible when the max and min bands overlap upstream).
        group_col = ph_sorted.columns.get_loc('Group')
        ph_sorted.iloc[-1, group_col] = group_id

        last_quality = ph_sorted['Quality'].iloc[-1]
        higher_quality = (ph_sorted['Quality'] > last_quality).to_numpy()
        if higher_quality.any():
            ph_sorted.iloc[higher_quality.argmax(), group_col] = group_id

        return ph_sorted

    def run(self, pressure_diff_threshold=120e3, split_threshold=8000):
        """
        Run the full pipeline on the stored DataFrame.

        Steps: partition around the pressure extremes, split the remaining
        rows at the first large pressure jump, order the segments, then group
        rows by enthalpy. The lower half of the split is not used further.

        Args:
            pressure_diff_threshold (float): Despite its name, this value is
                passed as the tolerance ``tol`` of
                ``filter_and_deduplicate_all_segments``.
            split_threshold (float): Pressure jump passed to
                ``split_based_on_pressure_difference``.

        Returns:
            pandas.DataFrame: The ordered and grouped DataFrame.
        """
        max_range_circ, min_range_circ, final_circ = self.filter_and_deduplicate_all_segments(
            pressure_diff_threshold
        )
        _lower_half, upper_half = self.split_based_on_pressure_difference(
            final_circ, split_threshold
        )
        combined_df = self.sort_and_assign_orders(max_range_circ, min_range_circ, upper_half)
        return self.group_by_enthalpy_and_pressure(combined_df)