diagram_ph/ExcelDataProcessor.py

129 lines
5.6 KiB
Python

import pandas as pd
import re
import json
#refactor the following class
class ExcelDataProcessor:
def __init__(self, config_path, config_file=True):
self.errors = [] # Initialize the error list
if config_file:
self.config = self.load_configuration(config_path)
if self.config:
self.dict_df = self.load_data_from_excel()
else:
self.dict_df = {}
def setconfiguration(self, config):
try:
self.config = config
self.dict_df = self.load_data_from_excel()
except Exception as e:
self.errors.append(f"Failed to set configuration: {e}")
def load_configuration(self, config_path):
"""Load the configuration from a JSON file."""
try:
with open(config_path, 'r') as file:
return json.load(file)
except Exception as e:
self.errors.append(f"Failed to load configuration: {e}")
return None
def load_data_from_excel(self):
"""Load data from an Excel file based on the configuration."""
try:
sheet_names = list(self.config["sheet_names"].values())
excel_data = pd.read_excel(
self.config["excel_file_path"],
sheet_name=sheet_names,
engine='openpyxl',
)
return {key: excel_data[value] for key, value in self.config["sheet_names"].items() if value in excel_data}
except Exception as e:
self.errors.append(f"Failed to load Excel data: {e}")
return {}
def rename_columns(self,circuit):
"""Rename columns based on a mapping DataFrame."""
data = self.dict_df.get("data")
mapping = self.dict_df.get("mapping")
if data is None or mapping is None:
self.errors.append("Data or mapping is missing.")
return
try:
rename_dict = {
row[circuit]: row['Variable in List']
for _, row in mapping.iterrows()
if row[circuit] in data.columns
and row[circuit] != row['Variable in List']
}
# Applying the renaming
renamed_data = data.rename(columns=rename_dict, inplace=False) # Use inplace=False for debugging
self.dict_df['data'] = renamed_data # Update the data only if rename is successful
except Exception as e:
self.errors.append(f"Error renaming columns: {e}")
def evaluate_equations(self):
"""Evaluate equations and apply them to the data, highlighting inconsistencies."""
data = self.dict_df.get("data")
equations = self.dict_df.get("equations")
if data is None or equations is None:
self.errors.append("Data or equations are missing.")
return
used_columns = set()
equation_columns = [] # List to store the names of the columns created by equations
# Extracting specific equations based on configuration
for _, equation in equations[self.config['processing_rules']['equation_columns'][0]].items():
if isinstance(equation, str):
try:
variables = re.findall(r'\b[\w\.]+\b', equation)
non_numeric_variables = [var for var in variables if not var.isnumeric() and var in data.columns]
if all(var in data.columns for var in non_numeric_variables):
# Define a function to evaluate the equation:
def evaluate_equation(row, eq=equation): # Default parameter captures the equation for use in lambda
namespace = row.to_dict()
return eval(eq, {}, namespace)
# Apply this function to each row of the data DataFrame:
data[equation] = data.apply(evaluate_equation, axis=1)
equation_columns.append(equation)
used_columns.update(non_numeric_variables)
else:
self.errors.append({"equation": equation, "error": "Contains unknown variables."})
except Exception as e:
self.errors.append({"equation": equation, "error": str(e)})
# If there's an error in evaluation, mark the entire column to highlight inconsistencies
data[equation] = 'Inconsistent' # Mark all values in this new column as 'Inconsistent'
else:
self.errors.append({"equation": str(equation), "error": "Not a string."})
equation_columns= equation_columns+ ['Conditions','Units']
data_r = data [equation_columns]
data_r.columns = equation_columns
return data_r# Return the modified data at the end of the function, not inside the loop
def process(self,circuit):
"""Method to execute the entire processing."""
self.rename_columns(circuit)
return self.evaluate_equations()
# if self.errors:
# print("Errors occurred during processing:")
# for error in self.errors:
# print(error)
# else:
# print("Processing completed without errors.")
# config_path = r"C:\Users\serameza\impact\EMEA_MBD_GitHub\CheckLabdata\config.json"
# processor = ExcelDataProcessor(config_path)
# processor.process()
# After processing, you can access the processed DataFrame via processor.dict_df["Labdata"] and any errors in processor.errors