Initial commit: Data Analysis application with FastAPI backend and Next.js frontend
backend/app/api/v1/analysis.py (new file, 147 lines added)
@@ -0,0 +1,147 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import List, Any, Dict, Optional
import pandas as pd
import numpy as np
from app.core.engine.clean import detect_univariate_outliers, detect_multivariate_outliers, merge_outliers, merge_outliers_structured
from app.core.engine.stats import calculate_correlation_matrix, calculate_feature_importance, run_regression_analysis

router = APIRouter(prefix="/analysis", tags=["analysis"])


class TypeValidationRequest(BaseModel):
    data: List[Any]
    target_type: str


class OutlierDetectionRequest(BaseModel):
    data: List[Dict[str, Optional[Any]]]
    columns: List[str]
    method: str = "both"
    excluded_indices: List[int] = []  # Rows to exclude from outlier detection


class CorrelationRequest(BaseModel):
    data: List[Dict[str, Optional[Any]]]
    columns: List[str]
    method: str = "pearson"  # pearson, spearman, kendall
    min_threshold: Optional[float] = None  # Optional minimum correlation threshold
    include_pvalues: bool = True


class FeatureImportanceRequest(BaseModel):
    data: List[Dict[str, Optional[Any]]]
    features: List[str]
    target: str


class RegressionRequest(BaseModel):
    data: List[Dict[str, Optional[Any]]]
    x_features: List[str]
    y_target: str
    model_type: str = "linear"
    # New engineering parameters
    poly_degree: int = 1  # Default to linear
    include_interactions: bool = False


@router.post("/validate-type")
async def validate_type_conversion(request: TypeValidationRequest):
    s = pd.Series(request.data)
    try:
        if request.target_type == "numeric":
            pd.to_numeric(s, errors='raise')
        elif request.target_type == "date":
            pd.to_datetime(s, errors='raise')
        # Any other target_type (e.g. plain text) is treated as always convertible.
        return {"status": "ok", "valid": True}
    except Exception as e:
        return {"status": "error", "valid": False, "message": str(e)}


@router.post("/detect-outliers")
async def detect_outliers(request: OutlierDetectionRequest):
    if not request.data:
        return {"outliers": []}

    df = pd.DataFrame(request.data).fillna(np.nan)

    # Pass excluded indices to the detection functions
    uni_results = detect_univariate_outliers(
        df, request.columns, request.excluded_indices
    ) if request.method in ["univariate", "both"] else {}

    multi_results = detect_multivariate_outliers(
        df, request.columns, request.excluded_indices
    ) if request.method in ["multivariate", "both"] else {}

    # Use the new structured merge function
    structured = merge_outliers_structured(uni_results, multi_results)

    return {
        "status": "ok",
        "total_count": len(structured["all"]),
        "outliers": structured["all"],  # Backwards compatibility
        "univariate": structured["univariate"],  # New: column-specific outliers
        "multivariate": structured["multivariate"]  # New: global outliers
    }


@router.post("/correlation")
async def get_correlation(request: CorrelationRequest):
    if not request.data or not request.columns:
        return {
            "status": "error",
            "message": "Data and columns are required",
            "result": {"matrix": [], "pvalues": [], "metadata": {}}
        }

    df = pd.DataFrame(request.data).fillna(np.nan)

    # Validate the method parameter
    valid_methods = ['pearson', 'spearman', 'kendall']
    if request.method not in valid_methods:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid method. Choose from: {', '.join(valid_methods)}"
        )

    try:
        result = calculate_correlation_matrix(
            df,
            request.columns,
            method=request.method,
            min_threshold=request.min_threshold,
            include_pvalues=request.include_pvalues
        )

        # Add summary statistics
        from app.core.engine.stats import get_correlation_summary
        summary = get_correlation_summary(result)

        return {
            "status": "ok",
            "result": result,
            "summary": summary
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Correlation calculation failed: {str(e)}")


@router.post("/feature-importance")
async def get_feature_importance(request: FeatureImportanceRequest):
    if not request.data or not request.features or not request.target:
        return {"importances": []}
    df = pd.DataFrame(request.data).fillna(np.nan)
    return {"status": "ok", "importances": calculate_feature_importance(df, request.features, request.target)}


@router.post("/run-regression")
async def run_regression(request: RegressionRequest):
    if not request.data or not request.x_features or not request.y_target:
        raise HTTPException(status_code=400, detail="Incomplete parameters.")

    df = pd.DataFrame(request.data).fillna(np.nan)

    try:
        results = run_regression_analysis(
            df,
            request.x_features,
            request.y_target,
            request.model_type,
            request.poly_degree,
            request.include_interactions
        )
        return {"status": "ok", "results": results}
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal Analysis Error: {str(e)}")
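Usage sketch: a minimal client for these endpoints, assuming the router is mounted under /api/v1 (the app wiring is not part of this commit) and using the `requests` library. The base URL and sample rows are illustrative only; field names follow the Pydantic models above.

import requests

BASE = "http://localhost:8000/api/v1/analysis"  # assumed mount point; not shown in this commit

rows = [{"x": 1.0, "y": 2.1}, {"x": 2.0, "y": 3.9}, {"x": 100.0, "y": -50.0}]

# Check whether a column can be coerced to numeric before converting it client-side
r = requests.post(f"{BASE}/validate-type", json={"data": [1, 2, "3"], "target_type": "numeric"})
print(r.json())  # {'status': 'ok', 'valid': True} on success

# Run both univariate and multivariate outlier detection
r = requests.post(f"{BASE}/detect-outliers",
                  json={"data": rows, "columns": ["x", "y"], "method": "both"})
print(r.json()["total_count"])

# Spearman correlation matrix with p-values
r = requests.post(f"{BASE}/correlation",
                  json={"data": rows, "columns": ["x", "y"], "method": "spearman"})

# Quadratic fit of y on x via the new poly_degree parameter
r = requests.post(f"{BASE}/run-regression", json={
    "data": rows, "x_features": ["x"], "y_target": "y",
    "model_type": "linear", "poly_degree": 2, "include_interactions": False,
})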
backend/app/api/v1/reports.py (new file, 33 lines added)
@@ -0,0 +1,33 @@
from fastapi import APIRouter, HTTPException, Response
from pydantic import BaseModel
from typing import Dict, Any, List
from app.core.engine.reports import create_pdf_report

router = APIRouter(prefix="/reports", tags=["reporting"])


class ExportRequest(BaseModel):
    project_name: str
    results: Dict[str, Any]
    audit_trail: Dict[str, Any]


@router.post("/export")
async def export_report(request: ExportRequest):
    """
    Generates and returns a PDF report.
    """
    try:
        pdf_bytes = create_pdf_report(
            request.project_name,
            request.results,
            request.audit_trail
        )

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": f"attachment; filename=Report_{request.project_name}.pdf"
            }
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
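Usage sketch: exporting a report from a client, again assuming an /api/v1 mount. The shapes of `results` and `audit_trail` are defined by `create_pdf_report`, which is not part of this file, so empty dicts stand in as placeholders.

import requests

# Placeholder payload: the real shapes of results/audit_trail are defined
# by create_pdf_report (not part of this commit's diff).
payload = {"project_name": "Demo", "results": {}, "audit_trail": {}}

r = requests.post("http://localhost:8000/api/v1/reports/export", json=payload)
r.raise_for_status()
with open("Report_Demo.pdf", "wb") as f:
    f.write(r.content)  # raw application/pdf bytes from the Response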
backend/app/api/v1/upload.py (new file, 44 lines added)
@@ -0,0 +1,44 @@
from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse
import io
import json
from app.core.engine.ingest import parse_file, get_column_metadata, dataframe_to_arrow_stream

router = APIRouter(prefix="/upload", tags=["ingestion"])


@router.post("")
async def upload_file(file: UploadFile = File(...)):
    """
    Endpoint to upload Excel/CSV files and receive an Apache Arrow stream.
    Metadata about columns is sent in the X-Column-Metadata header.
    """
    # 1. Validation
    if not file.filename.endswith(('.xlsx', '.xls', '.csv')):
        raise HTTPException(status_code=400, detail="Only .xlsx, .xls and .csv files are supported.")

    try:
        content = await file.read()

        # 2. Parsing
        df = parse_file(content, file.filename)

        # 3. Metadata extraction
        metadata = get_column_metadata(df)

        # 4. Conversion to Arrow
        arrow_bytes = dataframe_to_arrow_stream(df)

        # We use a StreamingResponse to send the binary Arrow data.
        # Metadata is sent as a custom header (JSON stringified).
        return StreamingResponse(
            io.BytesIO(arrow_bytes),
            media_type="application/vnd.apache.arrow.stream",
            headers={
                "X-Column-Metadata": json.dumps(metadata),
                "Access-Control-Expose-Headers": "X-Column-Metadata"
            }
        )

    except Exception as e:
        # In a real app, we'd log this properly
        raise HTTPException(status_code=400, detail=f"Error processing file: {str(e)}")
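Usage sketch: uploading a file and reading back both the Arrow stream and the column metadata header, assuming an /api/v1 mount and the `pyarrow` client library. The file name is illustrative.

import io
import json
import requests
import pyarrow as pa

with open("data.xlsx", "rb") as f:
    r = requests.post("http://localhost:8000/api/v1/upload",
                      files={"file": ("data.xlsx", f)})
r.raise_for_status()

# Column metadata travels in a custom header, exposed to browsers
# via Access-Control-Expose-Headers above
metadata = json.loads(r.headers["X-Column-Metadata"])

# The body is an Apache Arrow IPC stream; read it back into a DataFrame
table = pa.ipc.open_stream(io.BytesIO(r.content)).read_all()
df = table.to_pandas()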