2026-01-11 22:56:02 +01:00

57 lines
1.7 KiB
Python

import pandas as pd
import pyarrow as pa
import io
from typing import Tuple, Dict, Any
def parse_file(file_content: bytes, filename: str) -> pd.DataFrame:
    """
    Parse an uploaded file (Excel or CSV) into a pandas DataFrame.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original filename; its extension selects the parser
            (matched case-insensitively, so ``.CSV`` works too).

    Returns:
        The parsed DataFrame with whitespace-stripped column headers.

    Raises:
        ValueError: If the filename extension is not .xlsx, .xls, or .csv.
    """
    file_obj = io.BytesIO(file_content)
    lowered = filename.lower()
    if lowered.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(file_obj)
    elif lowered.endswith('.csv'):
        # Attempt to detect common delimiters if needed, default to comma
        df = pd.read_csv(file_obj)
    else:
        # Bug fix: the original f-string had no placeholder and always
        # reported the literal text "(unknown)"; include the real filename.
        raise ValueError(f"Unsupported file format: {filename}")
    # Basic hygiene: strip whitespace from headers
    df.columns = [str(c).strip() for c in df.columns]
    return df
def get_column_metadata(df: pd.DataFrame) -> list:
    """
    Return per-column metadata (name plus a simplified type) for the frontend.

    Args:
        df: The DataFrame to describe.

    Returns:
        A list of dicts, one per column, each with keys:
        ``name`` (column label), ``type`` (one of "numeric", "categorical",
        "date", "boolean"), and ``native_type`` (the pandas dtype string).
    """
    metadata = []
    for col in df.columns:
        dtype = str(df[col].dtype)
        # Simplify types for the frontend; anything unmatched is numeric.
        inferred_type = "numeric"
        # Bug fix: pandas 'category' dtype previously fell through to
        # "numeric"; it is categorical data and is now classified as such.
        if "object" in dtype or "string" in dtype or "category" in dtype:
            inferred_type = "categorical"
        elif "datetime" in dtype:
            inferred_type = "date"
        elif "bool" in dtype:
            inferred_type = "boolean"
        metadata.append({
            "name": col,
            "type": inferred_type,
            "native_type": dtype
        })
    return metadata
def dataframe_to_arrow_stream(df: pd.DataFrame) -> bytes:
    """
    Serialize a pandas DataFrame as an Apache Arrow IPC stream.

    Args:
        df: The DataFrame to serialize.

    Returns:
        The Arrow IPC stream as a raw ``bytes`` object, suitable for
        sending to an Arrow-aware client.
    """
    arrow_table = pa.Table.from_pandas(df)
    buffer_sink = pa.BufferOutputStream()
    writer = pa.ipc.new_stream(buffer_sink, arrow_table.schema)
    try:
        writer.write_table(arrow_table)
    finally:
        writer.close()
    return buffer_sink.getvalue().to_pybytes()