Initial commit

Commit 87a8b6b844 — 2026-01-11 22:04:05 +01:00
549 changed files with 96,211 additions and 0 deletions.

View File

@@ -0,0 +1,147 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import List, Any, Dict, Optional
import pandas as pd
import numpy as np
from app.core.engine.clean import detect_univariate_outliers, detect_multivariate_outliers, merge_outliers, merge_outliers_structured
from app.core.engine.stats import calculate_correlation_matrix, calculate_feature_importance, run_regression_analysis
# Shared router for all /analysis endpoints; mounted by the main FastAPI app.
router = APIRouter(prefix="/analysis", tags=["analysis"])
class TypeValidationRequest(BaseModel):
    """Payload for /analysis/validate-type: raw values plus the target type to test."""
    # Raw cell values to validate (any JSON-serializable scalars).
    data: List[Any]
    # Requested conversion target; "numeric" and "date" are the handled values.
    target_type: str
class OutlierDetectionRequest(BaseModel):
    """Payload for /analysis/detect-outliers."""
    # Rows as column-name -> value mappings; None marks missing cells.
    data: List[Dict[str, Optional[Any]]]
    # Columns to run detection on.
    columns: List[str]
    # "univariate", "multivariate" or "both" (default).
    method: str = "both"
    excluded_indices: List[int] = []  # Rows to exclude from outlier detection
class CorrelationRequest(BaseModel):
    """Payload for /analysis/correlation."""
    # Rows as column-name -> value mappings; None marks missing cells.
    data: List[Dict[str, Optional[Any]]]
    # Columns to include in the correlation matrix.
    columns: List[str]
    method: str = "pearson"  # pearson, spearman, kendall
    min_threshold: Optional[float] = None  # Optional minimum correlation threshold
    # Whether to compute p-values alongside the coefficients.
    include_pvalues: bool = True
class FeatureImportanceRequest(BaseModel):
    """Payload for /analysis/feature-importance."""
    # Rows as column-name -> value mappings; None marks missing cells.
    data: List[Dict[str, Optional[Any]]]
    # Predictor columns to score.
    features: List[str]
    # Target column the importances are computed against.
    target: str
class RegressionRequest(BaseModel):
    """Payload for /analysis/run-regression."""
    # Rows as column-name -> value mappings; None marks missing cells.
    data: List[Dict[str, Optional[Any]]]
    # Predictor columns.
    x_features: List[str]
    # Dependent-variable column.
    y_target: str
    # Model family; "linear" by default (other values presumably handled
    # by run_regression_analysis — confirm against app.core.engine.stats).
    model_type: str = "linear"
    # New Engineering Parameters
    poly_degree: int = 1  # Default to linear
    include_interactions: bool = False
@router.post("/validate-type")
async def validate_type_conversion(request: TypeValidationRequest):
    """Check whether every value in ``request.data`` converts to the requested type.

    Returns ``{"status": "ok", "valid": True}`` when the whole series converts,
    or ``{"status": "error", "valid": False, "message": ...}`` with the pandas
    error text when it does not. Raises HTTP 400 for an unsupported
    ``target_type`` (previously an unknown type fell through both branches and
    silently reported ``valid: True``).
    """
    valid_types = ("numeric", "date")
    if request.target_type not in valid_types:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid target_type. Choose from: {', '.join(valid_types)}"
        )
    s = pd.Series(request.data)
    try:
        if request.target_type == "numeric":
            pd.to_numeric(s, errors='raise')
        else:  # "date"
            pd.to_datetime(s, errors='raise')
        return {"status": "ok", "valid": True}
    except Exception as e:
        # Conversion failure is an expected outcome, reported in the body
        # rather than raised as an HTTP error.
        return {"status": "error", "valid": False, "message": str(e)}
@router.post("/detect-outliers")
async def detect_outliers(request: OutlierDetectionRequest):
    """Run univariate and/or multivariate outlier detection on the posted rows."""
    if not request.data:
        return {"outliers": []}
    frame = pd.DataFrame(request.data).fillna(np.nan)
    # Excluded rows are passed straight through to the detection helpers.
    uni_results = {}
    if request.method in ("univariate", "both"):
        uni_results = detect_univariate_outliers(
            frame, request.columns, request.excluded_indices
        )
    multi_results = {}
    if request.method in ("multivariate", "both"):
        multi_results = detect_multivariate_outliers(
            frame, request.columns, request.excluded_indices
        )
    # Merge both result sets into one structured payload.
    structured = merge_outliers_structured(uni_results, multi_results)
    return {
        "status": "ok",
        "total_count": len(structured["all"]),
        "outliers": structured["all"],  # Backwards compatibility
        "univariate": structured["univariate"],  # New: Column-specific outliers
        "multivariate": structured["multivariate"]  # New: Global outliers
    }
@router.post("/correlation")
async def get_correlation(request: CorrelationRequest):
    """Compute a correlation matrix (optionally with p-values) for the columns.

    Returns ``result`` (matrix/pvalues/metadata) plus ``summary`` statistics.
    Raises HTTP 400 for an unknown correlation method and HTTP 500 when the
    calculation itself fails.
    """
    if not request.data or not request.columns:
        return {
            "status": "error",
            "message": "Data and columns are required",
            "result": {"matrix": [], "pvalues": [], "metadata": {}}
        }
    # Validate the method BEFORE building the DataFrame so an invalid
    # request does no data work (previously validation ran after parsing).
    valid_methods = ['pearson', 'spearman', 'kendall']
    if request.method not in valid_methods:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid method. Choose from: {', '.join(valid_methods)}"
        )
    df = pd.DataFrame(request.data).fillna(np.nan)
    try:
        result = calculate_correlation_matrix(
            df,
            request.columns,
            method=request.method,
            min_threshold=request.min_threshold,
            include_pvalues=request.include_pvalues
        )
        # Imported lazily, matching the original module layout.
        from app.core.engine.stats import get_correlation_summary
        summary = get_correlation_summary(result)
        return {
            "status": "ok",
            "result": result,
            "summary": summary
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Correlation calculation failed: {str(e)}")
@router.post("/feature-importance")
async def get_feature_importance(request: FeatureImportanceRequest):
    """Compute feature-importance scores of ``features`` against ``target``.

    On incomplete input, returns an explicit error status (the legacy
    ``importances`` key is kept empty for old clients) — previously the
    error path omitted the ``status`` field every sibling endpoint returns.
    """
    if not request.data or not request.features or not request.target:
        return {
            "status": "error",
            "message": "Data, features and target are required",
            "importances": []
        }
    df = pd.DataFrame(request.data).fillna(np.nan)
    importances = calculate_feature_importance(df, request.features, request.target)
    return {"status": "ok", "importances": importances}
@router.post("/run-regression")
async def run_regression(request: RegressionRequest):
    """Fit the requested regression model and return its fitted results.

    Raises HTTP 400 on incomplete parameters or a ValueError from the
    engine, HTTP 500 on any other failure.
    """
    if not request.data or not request.x_features or not request.y_target:
        raise HTTPException(status_code=400, detail="Incomplete parameters.")
    frame = pd.DataFrame(request.data).fillna(np.nan)
    try:
        analysis = run_regression_analysis(
            frame,
            request.x_features,
            request.y_target,
            request.model_type,
            request.poly_degree,
            request.include_interactions,
        )
    except ValueError as err:
        # Client-side problem (bad columns, degenerate data, ...).
        raise HTTPException(status_code=400, detail=str(err))
    except Exception as err:
        raise HTTPException(status_code=500, detail=f"Internal Analysis Error: {str(err)}")
    return {"status": "ok", "results": analysis}

View File

@@ -0,0 +1,33 @@
from fastapi import APIRouter, HTTPException, Response
from pydantic import BaseModel
from typing import Dict, Any, List
from app.core.engine.reports import create_pdf_report
# Shared router for all /reports endpoints; mounted by the main FastAPI app.
router = APIRouter(prefix="/reports", tags=["reporting"])
class ExportRequest(BaseModel):
    """Payload for /reports/export."""
    # Used in the report title and the download filename.
    project_name: str
    # Analysis results to render into the PDF.
    results: Dict[str, Any]
    # Audit-trail entries included in the report.
    audit_trail: Dict[str, Any]
@router.post("/export")
async def export_report(request: ExportRequest):
    """
    Generates and returns a PDF report as a downloadable attachment.

    Raises HTTP 500 if report generation fails.
    """
    import re
    try:
        pdf_bytes = create_pdf_report(
            request.project_name,
            request.results,
            request.audit_trail
        )
        # Sanitize the project name before putting it in the header: an
        # unquoted name containing spaces, quotes or CR/LF would break (or
        # inject into) the Content-Disposition header.
        safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', request.project_name) or "export"
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": f'attachment; filename="Report_{safe_name}.pdf"'
            }
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,44 @@
from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse
import io
import json
from app.core.engine.ingest import parse_file, get_column_metadata, dataframe_to_arrow_stream
# Shared router for all /upload endpoints; mounted by the main FastAPI app.
router = APIRouter(prefix="/upload", tags=["ingestion"])
@router.post("")
async def upload_file(file: UploadFile = File(...)):
    """
    Endpoint to upload Excel/CSV files and receive an Apache Arrow stream.
    Metadata about columns is sent in the X-Column-Metadata header.

    Raises HTTP 400 for unsupported file types or processing failures.
    """
    # 1. Validation — UploadFile.filename may be None, and the extension
    # check is case-insensitive so "DATA.XLSX" is accepted (previously a
    # missing filename raised AttributeError and uppercase was rejected).
    filename = file.filename or ""
    if not filename.lower().endswith(('.xlsx', '.xls', '.csv')):
        raise HTTPException(status_code=400, detail="Only .xlsx, .xls and .csv files are supported.")
    try:
        content = await file.read()
        # 2. Parsing
        df = parse_file(content, filename)
        # 3. Metadata Extraction
        metadata = get_column_metadata(df)
        # 4. Conversion to Arrow
        arrow_bytes = dataframe_to_arrow_stream(df)
        # We use a StreamingResponse to send the binary Arrow data.
        # Metadata is sent as a custom header (JSON stringified) and
        # explicitly exposed so browser clients can read it cross-origin.
        return StreamingResponse(
            io.BytesIO(arrow_bytes),
            media_type="application/vnd.apache.arrow.stream",
            headers={
                "X-Column-Metadata": json.dumps(metadata),
                "Access-Control-Expose-Headers": "X-Column-Metadata"
            }
        )
    except Exception as e:
        # In a real app, we'd log this properly
        raise HTTPException(status_code=400, detail=f"Error processing file: {str(e)}")