57 lines
1.7 KiB
Python
57 lines
1.7 KiB
Python
import pandas as pd
|
|
import pyarrow as pa
|
|
import io
|
|
from typing import Tuple, Dict, Any
|
|
|
|
def parse_file(file_content: bytes, filename: str) -> pd.DataFrame:
    """
    Parse an uploaded Excel or CSV file into a Pandas DataFrame.

    The format is chosen from the filename extension, compared
    case-insensitively so that names such as ``DATA.CSV`` or ``Report.XLSX``
    are accepted.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original name of the upload; only its extension is used.

    Returns:
        The parsed DataFrame, with every column label converted to ``str``
        and stripped of leading/trailing whitespace.

    Raises:
        ValueError: If the extension is not ``.xlsx``, ``.xls`` or ``.csv``.
    """
    file_obj = io.BytesIO(file_content)

    # Match on a lower-cased copy so the extension check ignores case; the
    # original filename is kept for the error message.
    normalized_name = filename.lower()

    if normalized_name.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(file_obj)
    elif normalized_name.endswith('.csv'):
        # No delimiter sniffing is performed: read_csv is called with its
        # default (comma) separator.
        df = pd.read_csv(file_obj)
    else:
        raise ValueError(f"Unsupported file format: {filename}")

    # Basic hygiene: strip whitespace from headers.
    # NOTE: stripping can make two headers identical (e.g. "a" and " a").
    df.columns = [str(c).strip() for c in df.columns]

    return df
|
|
|
|
def get_column_metadata(df: pd.DataFrame) -> list:
    """
    Return a list of column metadata (name and inferred type).

    Each entry is a dict with three keys:

    * ``"name"``: the column label as found in ``df.columns``.
    * ``"type"``: a simplified type for the frontend, one of
      ``"categorical"``, ``"date"``, ``"boolean"`` or ``"numeric"``.
    * ``"native_type"``: ``str()`` of the column's pandas dtype.

    Args:
        df: The DataFrame to describe.

    Returns:
        One metadata dict per column, in column order. Any dtype that is not
        recognised as text-like, datetime or boolean is reported as
        ``"numeric"``.
    """
    # dtype names that contain neither "object" nor "string" but still hold
    # label/text data: "category" is the pandas Categorical dtype, and "str"
    # is the name pandas 3 gives its default string dtype.
    text_like_names = ("str", "category")

    metadata = []
    # Pair labels with dtypes positionally rather than via df[col]: with
    # duplicated column labels df[col] is a DataFrame and has no .dtype.
    for col, col_dtype in zip(df.columns, df.dtypes):
        dtype = str(col_dtype)

        # Simplify types for the frontend
        if "object" in dtype or "string" in dtype or dtype in text_like_names:
            inferred_type = "categorical"
        elif "datetime" in dtype:
            inferred_type = "date"
        elif "bool" in dtype:
            inferred_type = "boolean"
        else:
            inferred_type = "numeric"

        metadata.append({
            "name": col,
            "type": inferred_type,
            "native_type": dtype
        })
    return metadata
|
|
|
|
def dataframe_to_arrow_stream(df: pd.DataFrame) -> bytes:
    """
    Serialize a Pandas DataFrame as an Apache Arrow IPC stream.

    Args:
        df: The DataFrame to serialize.

    Returns:
        The complete IPC stream as a ``bytes`` object.
    """
    arrow_table = pa.Table.from_pandas(df)
    buffer_stream = pa.BufferOutputStream()

    stream_writer = pa.ipc.new_stream(buffer_stream, arrow_table.schema)
    try:
        stream_writer.write_table(arrow_table)
    finally:
        # Close the writer before the buffer is read back, whether or not
        # writing succeeded.
        stream_writer.close()

    return buffer_stream.getvalue().to_pybytes()
|