129 lines
5.6 KiB
Python
129 lines
5.6 KiB
Python
import pandas as pd
|
|
import re
|
|
import json
|
|
#refactor the following class
|
|
|
|
class ExcelDataProcessor:
|
|
def __init__(self, config_path, config_file=True):
|
|
self.errors = [] # Initialize the error list
|
|
if config_file:
|
|
self.config = self.load_configuration(config_path)
|
|
if self.config:
|
|
self.dict_df = self.load_data_from_excel()
|
|
else:
|
|
self.dict_df = {}
|
|
|
|
def setconfiguration(self, config):
|
|
try:
|
|
self.config = config
|
|
self.dict_df = self.load_data_from_excel()
|
|
except Exception as e:
|
|
self.errors.append(f"Failed to set configuration: {e}")
|
|
|
|
|
|
def load_configuration(self, config_path):
|
|
"""Load the configuration from a JSON file."""
|
|
try:
|
|
with open(config_path, 'r') as file:
|
|
return json.load(file)
|
|
except Exception as e:
|
|
self.errors.append(f"Failed to load configuration: {e}")
|
|
return None
|
|
|
|
def load_data_from_excel(self):
|
|
"""Load data from an Excel file based on the configuration."""
|
|
try:
|
|
sheet_names = list(self.config["sheet_names"].values())
|
|
excel_data = pd.read_excel(
|
|
self.config["excel_file_path"],
|
|
sheet_name=sheet_names,
|
|
engine='openpyxl',
|
|
)
|
|
return {key: excel_data[value] for key, value in self.config["sheet_names"].items() if value in excel_data}
|
|
except Exception as e:
|
|
self.errors.append(f"Failed to load Excel data: {e}")
|
|
return {}
|
|
|
|
def rename_columns(self,circuit):
|
|
"""Rename columns based on a mapping DataFrame."""
|
|
data = self.dict_df.get("data")
|
|
mapping = self.dict_df.get("mapping")
|
|
if data is None or mapping is None:
|
|
self.errors.append("Data or mapping is missing.")
|
|
return
|
|
|
|
try:
|
|
rename_dict = {
|
|
row[circuit]: row['Variable in List']
|
|
for _, row in mapping.iterrows()
|
|
if row[circuit] in data.columns
|
|
and row[circuit] != row['Variable in List']
|
|
}
|
|
# Applying the renaming
|
|
renamed_data = data.rename(columns=rename_dict, inplace=False) # Use inplace=False for debugging
|
|
self.dict_df['data'] = renamed_data # Update the data only if rename is successful
|
|
except Exception as e:
|
|
self.errors.append(f"Error renaming columns: {e}")
|
|
|
|
def evaluate_equations(self):
|
|
"""Evaluate equations and apply them to the data, highlighting inconsistencies."""
|
|
|
|
data = self.dict_df.get("data")
|
|
equations = self.dict_df.get("equations")
|
|
if data is None or equations is None:
|
|
self.errors.append("Data or equations are missing.")
|
|
return
|
|
|
|
used_columns = set()
|
|
equation_columns = [] # List to store the names of the columns created by equations
|
|
|
|
# Extracting specific equations based on configuration
|
|
for _, equation in equations[self.config['processing_rules']['equation_columns'][0]].items():
|
|
if isinstance(equation, str):
|
|
try:
|
|
variables = re.findall(r'\b[\w\.]+\b', equation)
|
|
non_numeric_variables = [var for var in variables if not var.isnumeric() and var in data.columns]
|
|
|
|
if all(var in data.columns for var in non_numeric_variables):
|
|
# Define a function to evaluate the equation:
|
|
def evaluate_equation(row, eq=equation): # Default parameter captures the equation for use in lambda
|
|
namespace = row.to_dict()
|
|
return eval(eq, {}, namespace)
|
|
|
|
# Apply this function to each row of the data DataFrame:
|
|
data[equation] = data.apply(evaluate_equation, axis=1)
|
|
equation_columns.append(equation)
|
|
used_columns.update(non_numeric_variables)
|
|
else:
|
|
self.errors.append({"equation": equation, "error": "Contains unknown variables."})
|
|
except Exception as e:
|
|
self.errors.append({"equation": equation, "error": str(e)})
|
|
# If there's an error in evaluation, mark the entire column to highlight inconsistencies
|
|
data[equation] = 'Inconsistent' # Mark all values in this new column as 'Inconsistent'
|
|
else:
|
|
self.errors.append({"equation": str(equation), "error": "Not a string."})
|
|
equation_columns= equation_columns+ ['Conditions','Units']
|
|
data_r = data [equation_columns]
|
|
data_r.columns = equation_columns
|
|
|
|
return data_r# Return the modified data at the end of the function, not inside the loop
|
|
|
|
|
|
def process(self,circuit):
|
|
"""Method to execute the entire processing."""
|
|
self.rename_columns(circuit)
|
|
return self.evaluate_equations()
|
|
# if self.errors:
|
|
# print("Errors occurred during processing:")
|
|
# for error in self.errors:
|
|
# print(error)
|
|
# else:
|
|
# print("Processing completed without errors.")
|
|
|
|
|
|
|
|
# config_path = r"C:\Users\serameza\impact\EMEA_MBD_GitHub\CheckLabdata\config.json"
|
|
# processor = ExcelDataProcessor(config_path)
|
|
# processor.process()
|
|
# After processing, you can access the processed DataFrame via processor.dict_df["Labdata"] and any errors in processor.errors
|