148 lines
5.2 KiB
Python
148 lines
5.2 KiB
Python
from fastapi import APIRouter, HTTPException
|
|
from pydantic import BaseModel
|
|
from typing import List, Any, Dict, Optional
|
|
import pandas as pd
|
|
import numpy as np
|
|
from app.core.engine.clean import detect_univariate_outliers, detect_multivariate_outliers, merge_outliers, merge_outliers_structured
|
|
from app.core.engine.stats import calculate_correlation_matrix, calculate_feature_importance, run_regression_analysis
|
|
|
|
# Router for the analysis endpoints; every route registered on it is served
# under the "/analysis" prefix and tagged "analysis".
router = APIRouter(prefix="/analysis", tags=["analysis"])
|
|
|
|
class TypeValidationRequest(BaseModel):
    """Request body for the type-validation endpoint.

    Carries one column of raw values together with the name of the type the
    caller would like to convert that column to.
    """

    # Raw values of the column under test; elements may be of any type.
    data: List[Any]

    # Name of the requested conversion target. The handler in this module
    # acts on "numeric" and "date".
    target_type: str
|
|
|
|
class OutlierDetectionRequest(BaseModel):
    """Request body for the outlier-detection endpoint."""

    # Table rows, one dict per row keyed by column name; cells may be null.
    data: List[Dict[str, Optional[Any]]]

    # Names of the columns handed to the detection helpers.
    columns: List[str]

    # Which detection pass to run. The handler checks for "univariate",
    # "multivariate" and "both".
    method: str = "both"

    # Rows to exclude from outlier detection.
    # NOTE: pydantic copies field defaults per model instance, so the usual
    # shared-mutable-default pitfall does not apply to this empty list.
    excluded_indices: List[int] = []
|
|
|
|
class CorrelationRequest(BaseModel):
    """Request body for the correlation-matrix endpoint."""

    # Table rows, one dict per row keyed by column name; cells may be null.
    data: List[Dict[str, Optional[Any]]]

    # Names of the columns to correlate.
    columns: List[str]

    # Correlation method: pearson, spearman or kendall.
    method: str = "pearson"

    # Optional minimum correlation threshold; None means no threshold is
    # passed on to the calculation.
    min_threshold: Optional[float] = None

    # Forwarded to the calculation as its ``include_pvalues`` flag.
    include_pvalues: bool = True
|
|
|
|
class FeatureImportanceRequest(BaseModel):
    """Request body for the feature-importance endpoint."""

    # Table rows, one dict per row keyed by column name; cells may be null.
    data: List[Dict[str, Optional[Any]]]

    # Names of the candidate feature columns.
    features: List[str]

    # Name of the target column the features are evaluated against.
    target: str
|
|
|
|
class RegressionRequest(BaseModel):
    """Request body for the regression endpoint."""

    # Table rows, one dict per row keyed by column name; cells may be null.
    data: List[Dict[str, Optional[Any]]]

    # Names of the predictor columns.
    x_features: List[str]

    # Name of the response column.
    y_target: str

    # Model selector passed straight through to the regression helper.
    model_type: str = "linear"

    # --- Feature-engineering parameters ---

    # Polynomial degree; the default of 1 keeps the model linear.
    poly_degree: int = 1

    # Forwarded to the regression helper as its interactions flag.
    include_interactions: bool = False
|
|
|
|
@router.post("/validate-type")
async def validate_type_conversion(request: TypeValidationRequest):
    """Report whether the submitted values can be converted to the target type.

    The values are wrapped in a pandas Series and run through the strict
    (``errors='raise'``) converter matching ``request.target_type``:
    ``pd.to_numeric`` for "numeric" and ``pd.to_datetime`` for "date". Any
    other target type performs no conversion and is reported as valid.

    Returns:
        ``{"status": "ok", "valid": True}`` when the conversion succeeds, or
        ``{"status": "error", "valid": False, "message": ...}`` carrying the
        text of the exception raised by the converter.
    """
    series = pd.Series(request.data)

    # Dispatch table: target type name -> strict pandas converter.
    converters = {"numeric": pd.to_numeric, "date": pd.to_datetime}
    convert = converters.get(request.target_type)

    try:
        if convert is not None:
            convert(series, errors='raise')
    except Exception as exc:
        # A failed conversion is an expected outcome, reported in the payload
        # rather than as an HTTP error.
        return {"status": "error", "valid": False, "message": str(exc)}

    return {"status": "ok", "valid": True}
|
|
|
|
@router.post("/detect-outliers")
async def detect_outliers(request: OutlierDetectionRequest):
    """Run univariate and/or multivariate outlier detection on the given rows.

    Args:
        request: Rows, the columns to inspect, the detection ``method``
            ("univariate", "multivariate" or "both") and the row indices to
            exclude from detection.

    Returns:
        ``{"outliers": []}`` when no rows were supplied. Otherwise a dict with
        ``status``, ``total_count``, ``outliers`` (the merged list, kept for
        backwards compatibility), ``univariate`` (column-specific outliers)
        and ``multivariate`` (global outliers).

    Raises:
        HTTPException: 400 when ``method`` is not one of the supported values.
    """
    if not request.data:
        # NOTE(review): this early response carries only the "outliers" key,
        # unlike the full response below; kept unchanged for existing clients.
        return {"outliers": []}

    # Reject unknown methods explicitly. Without this check neither detector
    # runs and the endpoint would report "no outliers" for a mistyped method.
    valid_methods = ("univariate", "multivariate", "both")
    if request.method not in valid_methods:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid method. Choose from: {', '.join(valid_methods)}",
        )

    df = pd.DataFrame(request.data).fillna(np.nan)

    # Pass excluded indices to the detection functions; a pass that is not
    # requested contributes an empty result.
    if request.method in ("univariate", "both"):
        uni_results = detect_univariate_outliers(
            df, request.columns, request.excluded_indices
        )
    else:
        uni_results = {}

    if request.method in ("multivariate", "both"):
        multi_results = detect_multivariate_outliers(
            df, request.columns, request.excluded_indices
        )
    else:
        multi_results = {}

    # Merge both passes into the structured (all / univariate / multivariate)
    # form.
    structured = merge_outliers_structured(uni_results, multi_results)

    return {
        "status": "ok",
        "total_count": len(structured["all"]),
        "outliers": structured["all"],  # Backwards compatibility
        "univariate": structured["univariate"],  # Column-specific outliers
        "multivariate": structured["multivariate"],  # Global outliers
    }
|
|
|
|
@router.post("/correlation")
async def get_correlation(request: CorrelationRequest):
    """Compute a correlation matrix, with a summary, for the requested columns.

    Args:
        request: Rows, the columns to correlate, the correlation ``method``
            (pearson, spearman or kendall), an optional minimum threshold and
            the p-value flag.

    Returns:
        On success ``{"status": "ok", "result": ..., "summary": ...}``. When
        data or columns are missing, a ``status: "error"`` payload with an
        empty result skeleton is returned instead of an HTTP error.

    Raises:
        HTTPException: 400 for an unsupported ``method`` or when the
            calculation rejects its input with a ``ValueError``; 500 for any
            other failure during the calculation.
    """
    if not request.data or not request.columns:
        return {
            "status": "error",
            "message": "Data and columns are required",
            "result": {"matrix": [], "pvalues": [], "metadata": {}},
        }

    # Validate the method before building the DataFrame, so an invalid
    # request is rejected without doing that work.
    valid_methods = ('pearson', 'spearman', 'kendall')
    if request.method not in valid_methods:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid method. Choose from: {', '.join(valid_methods)}",
        )

    df = pd.DataFrame(request.data).fillna(np.nan)

    try:
        result = calculate_correlation_matrix(
            df,
            request.columns,
            method=request.method,
            min_threshold=request.min_threshold,
            include_pvalues=request.include_pvalues,
        )

        # Add summary statistics. The import stays local, as in the original
        # handler, so this block does not depend on the file-level imports.
        from app.core.engine.stats import get_correlation_summary

        summary = get_correlation_summary(result)
    except ValueError as e:
        # Input-related failures are reported as client errors, matching the
        # regression endpoint's handling of ValueError.
        raise HTTPException(
            status_code=400,
            detail=f"Correlation calculation failed: {str(e)}",
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Correlation calculation failed: {str(e)}",
        ) from e

    return {
        "status": "ok",
        "result": result,
        "summary": summary,
    }
|
|
|
|
@router.post("/feature-importance")
async def get_feature_importance(request: FeatureImportanceRequest):
    """Compute feature importances of the given features against the target.

    Args:
        request: Rows, the feature column names and the target column name.

    Returns:
        ``{"importances": []}`` when data, features or target are missing;
        otherwise ``{"status": "ok", "importances": ...}`` with the value
        returned by ``calculate_feature_importance``.

    Raises:
        HTTPException: 400 when the calculation rejects its input with a
            ``ValueError``; 500 for any other failure.
    """
    if not request.data or not request.features or not request.target:
        # NOTE(review): this early response has no "status" key, unlike the
        # success payload; kept unchanged for existing clients.
        return {"importances": []}

    df = pd.DataFrame(request.data).fillna(np.nan)

    # Same error mapping as the regression endpoint, so failures carry a
    # useful detail message instead of an anonymous server error.
    try:
        importances = calculate_feature_importance(
            df, request.features, request.target
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Feature importance calculation failed: {str(e)}",
        ) from e

    return {"status": "ok", "importances": importances}
|
|
|
|
@router.post("/run-regression")
async def run_regression(request: RegressionRequest):
    """Fit a regression model on the submitted rows and return its results.

    The rows are loaded into a DataFrame and handed to
    ``run_regression_analysis`` together with the predictor names, the target
    name, the model type, the polynomial degree and the interactions flag.

    Returns:
        ``{"status": "ok", "results": ...}`` with the helper's return value.

    Raises:
        HTTPException: 400 when data, predictors or target are missing, or
            when the analysis raises ``ValueError``; 500 for any other
            failure inside the analysis.
    """
    has_all_inputs = request.data and request.x_features and request.y_target
    if not has_all_inputs:
        raise HTTPException(status_code=400, detail="Incomplete parameters.")

    frame = pd.DataFrame(request.data).fillna(np.nan)

    try:
        analysis = run_regression_analysis(
            frame,
            request.x_features,
            request.y_target,
            request.model_type,
            request.poly_degree,
            request.include_interactions
        )
    except ValueError as exc:
        # The analysis rejected its input: report it as a client error.
        raise HTTPException(status_code=400, detail=str(exc))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Internal Analysis Error: {str(exc)}")
    else:
        return {"status": "ok", "results": analysis}
|