diagram_ph/ExcelDataProcessor.py

import pandas as pd
import re
import json
#refactor the following class

class ExcelDataProcessor:
    def __init__(self, config_path, config_file=True):
        self.errors = []  # Initialize the error list
        if config_file:
            self.config = self.load_configuration(config_path)
            if self.config:
                self.dict_df = self.load_data_from_excel()
            else:
                self.dict_df = {}

    def setconfiguration(self, config):
        try:
            self.config = config
            self.dict_df = self.load_data_from_excel()
        except Exception as e:
            self.errors.append(f"Failed to set configuration: {e}")


    def load_configuration(self, config_path):
        """Load the configuration from a JSON file."""
        try:
            with open(config_path, 'r') as file:
                return json.load(file)
        except Exception as e:
            self.errors.append(f"Failed to load configuration: {e}")
            return None

    def load_data_from_excel(self):
        """Load data from an Excel file based on the configuration."""
        try:
            sheet_names = list(self.config["sheet_names"].values())
            excel_data = pd.read_excel(
                self.config["excel_file_path"],
                sheet_name=sheet_names,
                engine='openpyxl',
            )
            return {key: excel_data[value] for key, value in self.config["sheet_names"].items() if value in excel_data}
        except Exception as e:
            self.errors.append(f"Failed to load Excel data: {e}")
            return {}

    def rename_columns(self,circuit):
        """Rename columns based on a mapping DataFrame."""
        data = self.dict_df.get("data")
        mapping = self.dict_df.get("mapping")
        if data is None or mapping is None:
            self.errors.append("Data or mapping is missing.")
            return

        try:
            rename_dict = {
                row[circuit]: row['Variable in List']
                for _, row in mapping.iterrows()
                if row[circuit] in data.columns
                and row[circuit] != row['Variable in List']
            }
                    # Applying the renaming
            renamed_data = data.rename(columns=rename_dict, inplace=False)  # Use inplace=False for debugging
            self.dict_df['data'] = renamed_data  # Update the data only if rename is successful
        except Exception as e:
            self.errors.append(f"Error renaming columns: {e}")

    def evaluate_equations(self):
        """Evaluate equations and apply them to the data, highlighting inconsistencies."""

        data = self.dict_df.get("data")
        equations = self.dict_df.get("equations")
        if data is None or equations is None:
            self.errors.append("Data or equations are missing.")
            return

        used_columns = set()
        equation_columns = []  # List to store the names of the columns created by equations

        # Extracting specific equations based on configuration
        for _, equation in equations[self.config['processing_rules']['equation_columns'][0]].items():
            if isinstance(equation, str):
                try:
                    variables = re.findall(r'\b[\w\.]+\b', equation)
                    non_numeric_variables = [var for var in variables if not var.isnumeric() and var in data.columns]

                    if all(var in data.columns for var in non_numeric_variables):
                        # Define a function to evaluate the equation:
                        def evaluate_equation(row, eq=equation):  # Default parameter captures the equation for use in lambda
                            namespace = row.to_dict()
                            return eval(eq, {}, namespace)

                        # Apply this function to each row of the data DataFrame:
                        data[equation] = data.apply(evaluate_equation, axis=1)
                        equation_columns.append(equation)
                        used_columns.update(non_numeric_variables)
                    else:
                        self.errors.append({"equation": equation, "error": "Contains unknown variables."})
                except Exception as e:
                    self.errors.append({"equation": equation, "error": str(e)})
                    # If there's an error in evaluation, mark the entire column to highlight inconsistencies
                    data[equation] = 'Inconsistent'  # Mark all values in this new column as 'Inconsistent'
            else:
                self.errors.append({"equation": str(equation), "error": "Not a string."})
        equation_columns= equation_columns+  ['Conditions','Units']
        data_r = data [equation_columns]
        data_r.columns = equation_columns

        return  data_r# Return the modified data at the end of the function, not inside the loop


    def process(self,circuit):
        """Method to execute the entire processing."""
        self.rename_columns(circuit)
        return self.evaluate_equations()
        # if self.errors:
        #     print("Errors occurred during processing:")
        #     for error in self.errors:
        #         print(error)
        # else:
        #     print("Processing completed without errors.")


# config_path = r"C:\Users\serameza\impact\EMEA_MBD_GitHub\CheckLabdata\config.json"
# processor = ExcelDataProcessor(config_path)
# processor.process()
# After processing, you can access the processed DataFrame via processor.dict_df["Labdata"] and any errors in processor.errors