Analysis/backend/app/api/v1/analysis.py
2026-01-11 22:56:02 +01:00

148 lines
5.2 KiB
Python

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import List, Any, Dict, Optional
import pandas as pd
import numpy as np
from app.core.engine.clean import detect_univariate_outliers, detect_multivariate_outliers, merge_outliers, merge_outliers_structured
from app.core.engine.stats import calculate_correlation_matrix, calculate_feature_importance, run_regression_analysis
# Router collecting the analysis endpoints; every route registered on it is
# served under the "/analysis" prefix and tagged "analysis".
router = APIRouter(prefix="/analysis", tags=["analysis"])
class TypeValidationRequest(BaseModel):
    """Request body for the ``/validate-type`` endpoint.

    Bundles a list of raw values with the name of the type the caller
    wants to convert them to.
    """

    # Raw values to test; the endpoint wraps them in a pandas Series.
    data: List[Any]
    # Requested conversion. The endpoint acts on "numeric" and "date".
    target_type: str
class OutlierDetectionRequest(BaseModel):
    """Request body for the ``/detect-outliers`` endpoint."""

    # Row records: column name -> value, where a value may be None.
    data: List[Dict[str, Optional[Any]]]
    # Column names handed to the outlier detectors.
    columns: List[str]
    # Which detectors to run: "univariate", "multivariate" or "both".
    method: str = "both"
    # Rows to exclude from outlier detection.
    # NOTE: a literal [] default is fine here because pydantic copies field
    # defaults for each model instance instead of sharing one list.
    excluded_indices: List[int] = []
class CorrelationRequest(BaseModel):
    """Request body for the ``/correlation`` endpoint."""

    # Row records: column name -> value, where a value may be None.
    data: List[Dict[str, Optional[Any]]]
    # Columns forwarded to the correlation computation.
    columns: List[str]
    # Correlation method: pearson, spearman, kendall
    method: str = "pearson"
    # Optional minimum correlation threshold (None means not supplied).
    min_threshold: Optional[float] = None
    # Forwarded to the correlation computation as ``include_pvalues``.
    include_pvalues: bool = True
class FeatureImportanceRequest(BaseModel):
    """Request body for the ``/feature-importance`` endpoint."""

    # Row records: column name -> value, where a value may be None.
    data: List[Dict[str, Optional[Any]]]
    # Names of the candidate feature columns.
    features: List[str]
    # Name of the target column the features are evaluated against.
    target: str
class RegressionRequest(BaseModel):
    """Request body for the ``/run-regression`` endpoint.

    All fields are forwarded to ``run_regression_analysis`` by the endpoint.
    """

    # Row records: column name -> value, where a value may be None.
    data: List[Dict[str, Optional[Any]]]
    # Names of the predictor columns.
    x_features: List[str]
    # Name of the response column.
    y_target: str
    # Model selector; the accepted values are defined by the analysis engine.
    model_type: str = "linear"

    # --- Feature-engineering parameters ---
    # Default of 1 means linear.
    # NOTE(review): presumably the polynomial expansion degree - confirm
    # against run_regression_analysis.
    poly_degree: int = 1
    # NOTE(review): presumably toggles interaction terms - confirm against
    # run_regression_analysis.
    include_interactions: bool = False
@router.post("/validate-type")
async def validate_type_conversion(request: TypeValidationRequest):
    """Check whether the submitted values can be converted to ``target_type``.

    ``"numeric"`` is tested with ``pd.to_numeric`` and ``"date"`` with
    ``pd.to_datetime``, both with ``errors='raise'`` so the first bad value
    aborts the conversion. Any other ``target_type`` triggers no conversion
    at all and is therefore reported as valid.

    Returns:
        ``{"status": "ok", "valid": True}`` when nothing raised, otherwise
        ``{"status": "error", "valid": False, "message": <exception text>}``.
    """
    series = pd.Series(request.data)
    # Maps each supported target type to the pandas converter that checks it.
    strict_converters = {"numeric": pd.to_numeric, "date": pd.to_datetime}
    try:
        convert = strict_converters.get(request.target_type)
        if convert is not None:
            convert(series, errors='raise')
    except Exception as exc:
        # Conversion failures are reported in the payload, not as HTTP errors.
        return {"status": "error", "valid": False, "message": str(exc)}
    return {"status": "ok", "valid": True}
@router.post("/detect-outliers")
async def detect_outliers(request: OutlierDetectionRequest):
    """Run univariate and/or multivariate outlier detection on the rows.

    ``request.method`` selects the detectors: ``"univariate"``,
    ``"multivariate"`` or ``"both"``. Any other value runs neither detector,
    so the merged result is built from two empty inputs.

    When ``request.data`` is empty no detector is run, but the response keeps
    the same keys as a regular run so clients can read it uniformly.

    Returns:
        A dict with ``status``, ``total_count``, ``outliers`` (the merged
        ``"all"`` entry, kept for backwards compatibility), ``univariate``
        and ``multivariate``.
    """
    run_univariate = request.method in ("univariate", "both")
    run_multivariate = request.method in ("multivariate", "both")

    uni_results = {}
    multi_results = {}
    if request.data:
        # fillna(np.nan) replaces missing entries (e.g. None) with NaN.
        df = pd.DataFrame(request.data).fillna(np.nan)
        # Pass excluded indices to detection functions
        if run_univariate:
            uni_results = detect_univariate_outliers(
                df, request.columns, request.excluded_indices
            )
        if run_multivariate:
            multi_results = detect_multivariate_outliers(
                df, request.columns, request.excluded_indices
            )

    # Merging (possibly empty) results gives every response the same shape,
    # including the empty-data case that used to return only "outliers".
    structured = merge_outliers_structured(uni_results, multi_results)
    return {
        "status": "ok",
        "total_count": len(structured["all"]),
        "outliers": structured["all"],  # Backwards compatibility
        "univariate": structured["univariate"],  # Column-specific outliers
        "multivariate": structured["multivariate"],  # Global outliers
    }
@router.post("/correlation")
async def get_correlation(request: CorrelationRequest):
    """Compute a correlation matrix and its summary for the given columns.

    ``method``, ``min_threshold`` and ``include_pvalues`` are forwarded to
    ``calculate_correlation_matrix``; the result is then passed to
    ``get_correlation_summary``.

    Returns:
        ``{"status": "ok", "result": ..., "summary": ...}`` on success. When
        ``data`` or ``columns`` is empty, a ``status: "error"`` payload with
        an empty result skeleton is returned instead (HTTP 200).

    Raises:
        HTTPException: 400 if ``method`` is not pearson, spearman or kendall;
            500 if the correlation or summary computation raises.
    """
    if not request.data or not request.columns:
        return {
            "status": "error",
            "message": "Data and columns are required",
            "result": {"matrix": [], "pvalues": [], "metadata": {}}
        }

    # Validate the method first: it is cheap, and rejecting a bad request
    # here avoids building a DataFrame that would never be used.
    valid_methods = ['pearson', 'spearman', 'kendall']
    if request.method not in valid_methods:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid method. Choose from: {', '.join(valid_methods)}"
        )

    # fillna(np.nan) replaces missing entries (e.g. None) with NaN.
    df = pd.DataFrame(request.data).fillna(np.nan)

    try:
        result = calculate_correlation_matrix(
            df,
            request.columns,
            method=request.method,
            min_threshold=request.min_threshold,
            include_pvalues=request.include_pvalues
        )
        # Add summary statistics (imported locally, as in the original code).
        from app.core.engine.stats import get_correlation_summary
        summary = get_correlation_summary(result)
    except Exception as e:
        # Chain the cause so the original traceback is kept for server logs.
        raise HTTPException(
            status_code=500,
            detail=f"Correlation calculation failed: {str(e)}"
        ) from e

    return {
        "status": "ok",
        "result": result,
        "summary": summary
    }
@router.post("/feature-importance")
async def get_feature_importance(request: FeatureImportanceRequest):
    """Compute feature importances of ``features`` with respect to ``target``.

    If ``data``, ``features`` or ``target`` is empty there is nothing to
    compute and an empty importance list is returned. That response carries
    the same ``status`` key as the regular one so clients can treat both
    uniformly.

    Returns:
        ``{"status": "ok", "importances": ...}`` where ``importances`` is the
        value returned by ``calculate_feature_importance`` (or ``[]`` for the
        short-circuit case).
    """
    if not (request.data and request.features and request.target):
        return {"status": "ok", "importances": []}

    # fillna(np.nan) replaces missing entries (e.g. None) with NaN.
    df = pd.DataFrame(request.data).fillna(np.nan)
    importances = calculate_feature_importance(
        df, request.features, request.target
    )
    return {"status": "ok", "importances": importances}
@router.post("/run-regression")
async def run_regression(request: RegressionRequest):
    """Run a regression analysis on the submitted rows.

    The request fields are forwarded positionally to
    ``run_regression_analysis`` as (frame, x_features, y_target, model_type,
    poly_degree, include_interactions).

    Returns:
        ``{"status": "ok", "results": ...}`` with whatever
        ``run_regression_analysis`` returned.

    Raises:
        HTTPException: 400 if ``data``, ``x_features`` or ``y_target`` is
            empty, or if the analysis raises ``ValueError``; 500 for any
            other exception raised by the analysis.
    """
    if not request.data or not request.x_features or not request.y_target:
        raise HTTPException(status_code=400, detail="Incomplete parameters.")

    # fillna(np.nan) replaces missing entries (e.g. None) with NaN.
    df = pd.DataFrame(request.data).fillna(np.nan)

    try:
        results = run_regression_analysis(
            df,
            request.x_features,
            request.y_target,
            request.model_type,
            request.poly_degree,
            request.include_interactions
        )
    except ValueError as e:
        # ValueError is reported as a client error; the cause is chained so
        # the original traceback is not lost.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Internal Analysis Error: {str(e)}"
        ) from e

    return {"status": "ok", "results": results}