Initial commit

2026-01-11 22:04:05 +01:00
commit 87a8b6b844
549 changed files with 96211 additions and 0 deletions
--- a/backend/app/core/engine/ingest.py
+++ b/backend/app/core/engine/ingest.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import pyarrow as pa
+import io
+from typing import Tuple, Dict, Any
+
+def parse_file(file_content: bytes, filename: str) -> pd.DataFrame:
+    """
+    Parses the uploaded file (Excel .xlsx/.xls or CSV) into a Pandas DataFrame.
+
+    The format is picked from the extension of ``filename``, case-insensitively;
+    CSV is read with pandas' defaults (comma delimiter). Column headers are
+    converted to ``str`` and stripped. Raises ValueError for other extensions.
+    """
+    lowered_name = filename.lower()  # accept "DATA.CSV" / "Report.XLSX" as well
+    file_obj = io.BytesIO(file_content)
+    if lowered_name.endswith(('.xlsx', '.xls')):
+        df = pd.read_excel(file_obj)
+    elif lowered_name.endswith('.csv'):
+        df = pd.read_csv(file_obj)
+    else:
+        raise ValueError(f"Unsupported file format: {filename}")
+    df.columns = [str(c).strip() for c in df.columns]  # trim stray header whitespace
+    return df
+
+def get_column_metadata(df: pd.DataFrame) -> list:
+    """
+    Returns one dict per column with "name", a simplified "type" for the frontend
+    ("numeric", "categorical", "date" or "boolean") and the pandas "native_type".
+    """
+    types = pd.api.types
+    metadata = []
+    # Walk df.dtypes instead of df[col]: with duplicate column names df[col] is a
+    # DataFrame, which has no .dtype attribute.
+    for col, dtype in df.dtypes.items():
+        # Dtype predicates, not substring tests: "category" must not end up numeric.
+        if (isinstance(dtype, pd.CategoricalDtype)
+                or types.is_object_dtype(dtype) or types.is_string_dtype(dtype)):
+            inferred_type = "categorical"
+        elif types.is_datetime64_any_dtype(dtype):
+            inferred_type = "date"
+        elif types.is_bool_dtype(dtype):
+            inferred_type = "boolean"
+        else:
+            inferred_type = "numeric"  # ints, floats and any other dtype
+        metadata.append({"name": col, "type": inferred_type, "native_type": str(dtype)})
+    return metadata
+
+def dataframe_to_arrow_stream(df: pd.DataFrame) -> bytes:
+    """Serializes a Pandas DataFrame as the bytes of an Apache Arrow IPC stream."""
+    arrow_table = pa.Table.from_pandas(df)
+    buffer_sink = pa.BufferOutputStream()
+    stream_writer = pa.ipc.new_stream(buffer_sink, arrow_table.schema)
+    # Leaving the `with` block closes the writer before the buffer is read back.
+    with stream_writer:
+        stream_writer.write_table(arrow_table)
+    return buffer_sink.getvalue().to_pybytes()