Initial commit
This commit is contained in:
1
backend/.python-version
Normal file
1
backend/.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.12
|
||||
20
backend/Dockerfile
Normal file
20
backend/Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
# Simple Dockerfile - install everything in one place
FROM python:3.12-slim-bookworm

WORKDIR /app

# Unbuffered stdout/stderr (live container logs) and no .pyc files in the image.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

# Install pip packages directly (no uv complexity)
# Copying only pyproject.toml first keeps the dependency layer cacheable.
# NOTE(review): `pip install -e .` runs before the application source is copied
# in — confirm the project metadata alone is enough for the install to succeed.
COPY pyproject.toml ./
RUN pip install --no-cache-dir -e .

# Copy application
COPY . .

# Expose port
EXPOSE 8000

# Run directly
CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
0
backend/README.md
Normal file
0
backend/README.md
Normal file
147
backend/app/api/v1/analysis.py
Normal file
147
backend/app/api/v1/analysis.py
Normal file
@@ -0,0 +1,147 @@
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Any, Dict, Optional
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from app.core.engine.clean import detect_univariate_outliers, detect_multivariate_outliers, merge_outliers, merge_outliers_structured
|
||||
from app.core.engine.stats import calculate_correlation_matrix, calculate_feature_importance, run_regression_analysis
|
||||
|
||||
router = APIRouter(prefix="/analysis", tags=["analysis"])
|
||||
|
||||
class TypeValidationRequest(BaseModel):
    # Payload for POST /analysis/validate-type.
    data: List[Any]   # raw values of a single column, as received from the client
    target_type: str  # "numeric" or "date"; any other value validates trivially
|
||||
|
||||
class OutlierDetectionRequest(BaseModel):
    # Payload for POST /analysis/detect-outliers.
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    columns: List[str]                    # columns to scan for outliers
    method: str = "both"                  # "univariate", "multivariate" or "both"
    # Mutable default is safe here: pydantic deep-copies field defaults per instance.
    excluded_indices: List[int] = []  # Rows to exclude from outlier detection
|
||||
|
||||
class CorrelationRequest(BaseModel):
    # Payload for POST /analysis/correlation.
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    columns: List[str]                    # columns to correlate pairwise
    method: str = "pearson"  # pearson, spearman, kendall
    min_threshold: Optional[float] = None  # Optional minimum correlation threshold
    include_pvalues: bool = True  # also compute per-pair significance
|
||||
|
||||
class FeatureImportanceRequest(BaseModel):
    # Payload for POST /analysis/feature-importance.
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    features: List[str]  # candidate predictor columns
    target: str          # column the features are scored against
|
||||
|
||||
class RegressionRequest(BaseModel):
    # Payload for POST /analysis/run-regression.
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    x_features: List[str]  # predictor columns
    y_target: str          # response column
    model_type: str = "linear"
    # New Engineering Parameters
    poly_degree: int = 1  # Default to linear
    # presumably toggles interaction terms — see run_regression_analysis for semantics
    include_interactions: bool = False
|
||||
|
||||
@router.post("/validate-type")
async def validate_type_conversion(request: TypeValidationRequest):
    """Dry-run a type conversion on a column's raw values.

    Returns valid=True when every value converts to the requested type,
    otherwise valid=False plus the underlying conversion error message.
    """
    series = pd.Series(request.data)
    # Strict converters per target type; unrecognised types convert nothing
    # and therefore always validate successfully.
    converters = {
        "numeric": lambda s: pd.to_numeric(s, errors='raise'),
        "date": lambda s: pd.to_datetime(s, errors='raise'),
    }
    try:
        convert = converters.get(request.target_type)
        if convert is not None:
            convert(series)
        return {"status": "ok", "valid": True}
    except Exception as exc:
        return {"status": "error", "valid": False, "message": str(exc)}
|
||||
|
||||
@router.post("/detect-outliers")
async def detect_outliers(request: OutlierDetectionRequest):
    """Run univariate (IQR) and/or multivariate (Isolation Forest) detection."""
    if not request.data:
        return {"outliers": []}

    frame = pd.DataFrame(request.data).fillna(np.nan)

    # Each detector runs only when its mode is requested; rows listed in
    # excluded_indices are filtered inside the detection functions.
    uni_results = {}
    if request.method in ("univariate", "both"):
        uni_results = detect_univariate_outliers(
            frame, request.columns, request.excluded_indices
        )

    multi_results = {}
    if request.method in ("multivariate", "both"):
        multi_results = detect_multivariate_outliers(
            frame, request.columns, request.excluded_indices
        )

    structured = merge_outliers_structured(uni_results, multi_results)

    return {
        "status": "ok",
        "total_count": len(structured["all"]),
        "outliers": structured["all"],              # legacy flat list
        "univariate": structured["univariate"],     # per-column outliers
        "multivariate": structured["multivariate"]  # dataset-wide anomalies
    }
|
||||
|
||||
@router.post("/correlation")
async def get_correlation(request: CorrelationRequest):
    """Compute a correlation matrix (with optional p-values) for the given columns."""
    if not request.data or not request.columns:
        return {
            "status": "error",
            "message": "Data and columns are required",
            "result": {"matrix": [], "pvalues": [], "metadata": {}}
        }

    frame = pd.DataFrame(request.data).fillna(np.nan)

    # Reject unsupported correlation methods up front with a 400.
    allowed_methods = ['pearson', 'spearman', 'kendall']
    if request.method not in allowed_methods:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid method. Choose from: {', '.join(allowed_methods)}"
        )

    try:
        result = calculate_correlation_matrix(
            frame,
            request.columns,
            method=request.method,
            min_threshold=request.min_threshold,
            include_pvalues=request.include_pvalues
        )

        # Summary (strongest/weakest pairs) is derived from the matrix result.
        from app.core.engine.stats import get_correlation_summary
        summary = get_correlation_summary(result)

        return {"status": "ok", "result": result, "summary": summary}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Correlation calculation failed: {str(e)}")
|
||||
|
||||
@router.post("/feature-importance")
async def get_feature_importance(request: FeatureImportanceRequest):
    """Rank features by permutation importance against the target column."""
    if not request.data or not request.features or not request.target:
        return {"importances": []}

    frame = pd.DataFrame(request.data).fillna(np.nan)
    scores = calculate_feature_importance(frame, request.features, request.target)
    return {"status": "ok", "importances": scores}
|
||||
|
||||
@router.post("/run-regression")
async def run_regression(request: RegressionRequest):
    """Fit the requested regression model and return its diagnostics."""
    if not request.data or not request.x_features or not request.y_target:
        raise HTTPException(status_code=400, detail="Incomplete parameters.")

    frame = pd.DataFrame(request.data).fillna(np.nan)

    try:
        results = run_regression_analysis(
            frame,
            request.x_features,
            request.y_target,
            request.model_type,
            request.poly_degree,
            request.include_interactions,
        )
        return {"status": "ok", "results": results}
    except ValueError as e:
        # Validation/domain problems from the engine surface as 400s.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal Analysis Error: {str(e)}")
|
||||
33
backend/app/api/v1/reports.py
Normal file
33
backend/app/api/v1/reports.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from fastapi import APIRouter, HTTPException, Response
|
||||
from pydantic import BaseModel
|
||||
from typing import Dict, Any, List
|
||||
from app.core.engine.reports import create_pdf_report
|
||||
|
||||
router = APIRouter(prefix="/reports", tags=["reporting"])
|
||||
|
||||
class ExportRequest(BaseModel):
    # Payload for POST /reports/export.
    project_name: str            # shown in the report and used in the download filename
    results: Dict[str, Any]      # analysis results rendered into the PDF
    audit_trail: Dict[str, Any]  # cleaning/reproducibility details for the audit section
|
||||
|
||||
@router.post("/export")
async def export_report(request: ExportRequest):
    """
    Generates and returns a PDF report.
    """
    try:
        document = create_pdf_report(
            request.project_name,
            request.results,
            request.audit_trail,
        )
        # NOTE(review): project_name is interpolated unsanitised into the
        # Content-Disposition header — confirm it is validated upstream.
        headers = {
            "Content-Disposition": f"attachment; filename=Report_{request.project_name}.pdf"
        }
        return Response(
            content=document,
            media_type="application/pdf",
            headers=headers,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
44
backend/app/api/v1/upload.py
Normal file
44
backend/app/api/v1/upload.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
import io
|
||||
import json
|
||||
from app.core.engine.ingest import parse_file, get_column_metadata, dataframe_to_arrow_stream
|
||||
|
||||
router = APIRouter(prefix="/upload", tags=["ingestion"])
|
||||
|
||||
@router.post("")
async def upload_file(file: UploadFile = File(...)):
    """
    Upload an Excel/CSV file and receive an Apache Arrow IPC stream back.

    Column metadata (name + inferred type) is returned in the
    X-Column-Metadata response header as a JSON string.
    """
    # 1. Validation.
    # Bug fix: file.filename can be None for malformed multipart bodies, which
    # previously crashed with an unhandled AttributeError; now it is a clean 400.
    if not file.filename or not file.filename.endswith(('.xlsx', '.xls', '.csv')):
        raise HTTPException(status_code=400, detail="Only .xlsx, .xls and .csv files are supported.")

    try:
        content = await file.read()

        # 2. Parsing
        df = parse_file(content, file.filename)

        # 3. Metadata Extraction (name + simplified type per column)
        metadata = get_column_metadata(df)

        # 4. Conversion to Arrow IPC stream bytes
        arrow_bytes = dataframe_to_arrow_stream(df)

        # Binary Arrow payload goes in the body; metadata travels as a JSON
        # header, which must be explicitly exposed for browser clients (CORS).
        return StreamingResponse(
            io.BytesIO(arrow_bytes),
            media_type="application/vnd.apache.arrow.stream",
            headers={
                "X-Column-Metadata": json.dumps(metadata),
                "Access-Control-Expose-Headers": "X-Column-Metadata"
            }
        )

    except Exception as e:
        # In a real app, we'd log this properly
        raise HTTPException(status_code=400, detail=f"Error processing file: {str(e)}")
|
||||
165
backend/app/core/engine/clean.py
Normal file
165
backend/app/core/engine/clean.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from typing import List, Dict, Any
|
||||
|
||||
def detect_univariate_outliers(df: pd.DataFrame, columns: List[str], excluded_indices: List[int] = None) -> Dict[int, List[str]]:
    """
    Flag per-column outliers using Tukey's IQR fences (1.5 * IQR).

    Args:
        df: Input DataFrame
        columns: Column names to analyze (names missing from df are skipped)
        excluded_indices: Row indices dropped before the fences are computed

    Returns:
        Dictionary of {original_row_index: [reasons]}
    """
    # Drop excluded rows so they influence neither quantiles nor results.
    if excluded_indices:
        df = df[~df.index.isin(excluded_indices)]

    flagged: Dict[int, List[str]] = {}
    for column in columns:
        if column not in df.columns:
            continue

        values = pd.to_numeric(df[column], errors='coerce')
        first_q = values.quantile(0.25)
        third_q = values.quantile(0.75)
        spread = third_q - first_q
        low = first_q - 1.5 * spread
        high = third_q + 1.5 * spread

        # NaNs compare False on both sides, so non-numeric cells are never flagged.
        mask = (values < low) | (values > high)
        for row_idx in df.index[mask].tolist():
            cell = df.at[row_idx, column]
            message = f"Column '{column}' value {cell} is outside IQR bounds [{low:.2f}, {high:.2f}]"
            flagged.setdefault(row_idx, []).append(message)

    return flagged
|
||||
|
||||
def detect_multivariate_outliers(df: pd.DataFrame, columns: List[str], excluded_indices: List[int] = None) -> Dict[int, List[str]]:
    """
    Detects anomalies across multiple numeric columns using Isolation Forest.

    Args:
        df: Input DataFrame
        columns: List of column names to analyze
        excluded_indices: Row indices to drop before fitting the model

    Returns:
        Dictionary of {original_row_index: [reasons]}
    """
    # Exclude specified rows if provided.
    # (Removed dead code: the old `original_indices` list was never used.)
    if excluded_indices:
        df = df[~df.index.isin(excluded_indices)]

    # Select only relevant numeric columns
    numeric_df = df[columns].apply(pd.to_numeric, errors='coerce')

    if numeric_df.empty:
        return {}

    # Isolation Forest cannot handle NaNs: keep only fully-populated rows.
    valid_mask = numeric_df.notna().all(axis=1)
    numeric_df_clean = numeric_df[valid_mask]

    if numeric_df_clean.empty:
        return {}

    # Fixed random_state keeps results reproducible across requests.
    model = IsolationForest(contamination='auto', random_state=42)
    preds = model.fit_predict(numeric_df_clean)

    # IsolationForest returns -1 for outliers. The filtered frame keeps the
    # original row labels, so they map straight back to the caller's rows.
    outlier_indices = numeric_df_clean.index[preds == -1].tolist()

    return {int(idx): ["Multivariate anomaly detected by Isolation Forest"] for idx in outlier_indices}
|
||||
|
||||
def merge_outliers(uni: Dict[int, List[str]], multi: Dict[int, List[str]]) -> List[Dict[str, Any]]:
    """
    Merges results into a flat list of outlier objects, sorted by row index.
    DEPRECATED: Use merge_outliers_structured instead for better type separation.
    """
    combined = sorted(set(uni) | set(multi))
    return [
        {"index": int(row), "reasons": uni.get(row, []) + multi.get(row, [])}
        for row in combined
    ]
|
||||
|
||||
def merge_outliers_structured(uni: Dict[int, List[str]], multi: Dict[int, List[str]]) -> Dict[str, Any]:
    """
    Merges and separates outliers by type for better frontend handling.

    Returns:
        Dictionary with:
        - 'univariate': Dict mapping column names to their specific outliers
        - 'multivariate': List of outliers that affect multiple columns
        - 'all': Flat list of all outliers (for backwards compatibility)

    The split lets the frontend show column-specific outliers on a column
    header click, show global anomalies separately, and keep legacy callers
    working via the flat 'all' list.
    """
    # Group univariate hits per column by parsing the column name out of each
    # reason string (format: "Column '<name>' value ...").
    marker = "Column '"
    per_column: Dict[str, List[Dict[str, Any]]] = {}

    for row, reasons in uni.items():
        for reason in reasons:
            if marker not in reason:
                continue  # reason does not follow the expected format; skip it
            start = reason.index(marker) + len(marker)
            name = reason[start:reason.index("'", start)]

            bucket = per_column.setdefault(name, [])
            # One entry per (column, row); extra reasons append to it.
            entry = next((e for e in bucket if e["index"] == row), None)
            if entry is None:
                bucket.append({"index": int(row), "reasons": [reason]})
            else:
                entry["reasons"].append(reason)

    multivariate_list = [
        {"index": int(row), "reasons": reasons}
        for row, reasons in multi.items()
    ]

    # Legacy flat list: union of both result sets, sorted by row index.
    flat = [
        {"index": int(row), "reasons": uni.get(row, []) + multi.get(row, [])}
        for row in sorted(set(uni) | set(multi))
    ]

    return {
        "univariate": per_column,
        "multivariate": multivariate_list,
        "all": flat,
    }
|
||||
56
backend/app/core/engine/ingest.py
Normal file
56
backend/app/core/engine/ingest.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import io
|
||||
from typing import Tuple, Dict, Any
|
||||
|
||||
def parse_file(file_content: bytes, filename: str) -> pd.DataFrame:
    """
    Parses the uploaded file (Excel or CSV) into a Pandas DataFrame.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original file name; its extension selects the parser.

    Returns:
        Parsed DataFrame with whitespace-stripped column headers.

    Raises:
        ValueError: If the extension is not .xlsx, .xls or .csv.
    """
    file_obj = io.BytesIO(file_content)

    if filename.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(file_obj)
    elif filename.endswith('.csv'):
        # Attempt to detect common delimiters if needed, default to comma
        df = pd.read_csv(file_obj)
    else:
        # Bug fix: the message previously contained the literal text
        # "(unknown)" instead of the offending file name.
        raise ValueError(f"Unsupported file format: {filename}")

    # Basic hygiene: strip whitespace from headers
    df.columns = [str(c).strip() for c in df.columns]

    return df
|
||||
|
||||
def get_column_metadata(df: pd.DataFrame) -> list:
    """
    Returns a list of column metadata (name, simplified type, native dtype).
    """
    def simplify(dtype_name: str) -> str:
        # Collapse pandas dtypes into the four categories the frontend expects.
        if "object" in dtype_name or "string" in dtype_name:
            return "categorical"
        if "datetime" in dtype_name:
            return "date"
        if "bool" in dtype_name:
            return "boolean"
        return "numeric"

    return [
        {
            "name": col,
            "type": simplify(str(df[col].dtype)),
            "native_type": str(df[col].dtype),
        }
        for col in df.columns
    ]
|
||||
|
||||
def dataframe_to_arrow_stream(df: pd.DataFrame) -> bytes:
    """
    Converts a Pandas DataFrame to Apache Arrow IPC stream bytes.
    """
    table = pa.Table.from_pandas(df)
    buffer = pa.BufferOutputStream()
    # The writer is a context manager; closing it finalizes the stream.
    writer = pa.ipc.new_stream(buffer, table.schema)
    with writer:
        writer.write_table(table)
    return buffer.getvalue().to_pybytes()
|
||||
223
backend/app/core/engine/reports.py
Normal file
223
backend/app/core/engine/reports.py
Normal file
@@ -0,0 +1,223 @@
|
||||
from fpdf import FPDF
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
import pandas as pd
|
||||
import platform
|
||||
import sklearn
|
||||
import statsmodels
|
||||
import os
|
||||
import matplotlib
|
||||
matplotlib.use('Agg') # Use non-GUI backend
|
||||
import matplotlib.pyplot as plt
|
||||
import tempfile
|
||||
|
||||
class AnalysisReport(FPDF):
    """Branded PDF layout: indigo title header and timestamped page footer.

    FPDF calls header()/footer() automatically on every page; both swallow
    and print their own errors so drawing issues never abort generation.
    """

    def header(self):
        try:
            # Add Unicode font support for accented characters
            self.set_font('Arial', 'B', 15)
            self.set_text_color(79, 70, 229)  # Indigo 600
            self.cell(0, 10, 'Data_analysis - Rapport de Validation', 0, 1, 'L')
            self.set_draw_color(226, 232, 240)
            # Horizontal separator under the title.
            self.line(10, 22, 200, 22)
            self.ln(10)
        except Exception as e:
            print(f"Header error: {e}")

    def footer(self):
        try:
            # Position 15 mm above the bottom edge.
            self.set_y(-15)
            self.set_font('Arial', 'I', 8)
            self.set_text_color(148, 163, 184)
            self.cell(0, 10, f'Page {self.page_no()} | Genere le {datetime.now().strftime("%Y-%m-%d %H:%M")}', 0, 0, 'C')
        except Exception as e:
            print(f"Footer error: {e}")
|
||||
|
||||
def create_pdf_report(project_name: str, results: dict, audit_trail: dict) -> bytes:
    """
    Render a full analysis report as PDF bytes.

    Sections: summary, coefficients table, regression/residual plots, and an
    audit trail with library versions. On ANY failure a small error PDF is
    returned instead of raising, so callers always receive valid PDF bytes.

    Args:
        project_name: Title shown in the summary section.
        results: Analysis output; keys read here: model_type, r_squared,
            sample_size, coefficients, p_values, fit_plot, diagnostic_plot.
        audit_trail: Cleaning info; only excluded_rows_count is read.
    """
    try:
        pdf = AnalysisReport()
        pdf.add_page()

        # 1. Summary
        pdf.set_font('Arial', 'B', 12)
        pdf.set_text_color(51, 65, 85)
        pdf.cell(0, 10, f"Projet : {project_name}", 0, 1)
        pdf.ln(5)

        # Handle missing model_type
        model_type = results.get('model_type', 'Regression')
        if isinstance(model_type, list):
            model_type = model_type[0] if model_type else 'Regression'

        pdf.set_font('Arial', '', 10)
        pdf.cell(0, 8, f"Modele : {model_type}", 0, 1)

        # Handle r_squared safely
        r_squared = results.get('r_squared', 0)
        if r_squared is None:
            r_squared = 0
        pdf.cell(0, 8, f"Precision (R²) : {float(r_squared):.4f}", 0, 1)

        # Handle sample_size safely
        sample_size = results.get('sample_size', 0)
        if sample_size is None:
            sample_size = 0
        pdf.cell(0, 8, f"Taille de l'echantillon : {int(sample_size)}", 0, 1)

        pdf.ln(10)

        # 2. Coefficients Table
        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 10, "Coefficients du Modele", 0, 1)

        # Table header row with a light fill.
        pdf.set_font('Arial', 'B', 9)
        pdf.set_fill_color(248, 250, 252)
        pdf.cell(80, 8, "Feature", 1, 0, 'L', True)
        pdf.cell(50, 8, "Coefficient", 1, 0, 'R', True)
        pdf.cell(50, 8, "P-Value", 1, 1, 'R', True)

        # Get coefficients and p_values safely
        coefficients = results.get('coefficients', {})
        p_values = results.get('p_values', {})

        if coefficients:
            pdf.set_font('Arial', '', 9)
            for name, coef in coefficients.items():
                # Convert coef to float safely
                try:
                    coef_val = float(coef)
                except (TypeError, ValueError):
                    coef_val = 0.0

                # Get p-value safely; missing entries default to 1.0 (not significant)
                p_val = p_values.get(name, 1.0)
                try:
                    p_val = float(p_val)
                except (TypeError, ValueError):
                    p_val = 1.0

                pdf.cell(80, 8, str(name), 1)
                pdf.cell(50, 8, f"{coef_val:.4f}", 1, 0, 'R')

                # Color-code significance: green under 0.05, red otherwise.
                if p_val < 0.05:
                    pdf.set_text_color(16, 185, 129)  # Emerald
                else:
                    pdf.set_text_color(244, 63, 94)  # Rose
                pdf.cell(50, 8, f"{p_val:.4f}", 1, 1, 'R')
                # Restore the default body text color for the next row.
                pdf.set_text_color(51, 65, 85)
        else:
            pdf.set_font('Arial', '', 9)
            pdf.cell(0, 8, "Aucun coefficient disponible", 0, 1)

        pdf.ln(15)

        # 3. Visualization Charts
        if 'fit_plot' in results and len(results['fit_plot']) > 0:
            pdf.set_font('Arial', 'B', 11)
            pdf.cell(0, 10, "Courbe de Regression", 0, 1)
            pdf.ln(5)

            # Create fit plot
            fit_data = results['fit_plot']
            x_vals = [p['x'] for p in fit_data]
            y_real = [p['real'] for p in fit_data]
            y_pred = [p['pred'] for p in fit_data]

            plt.figure(figsize=(10, 6))
            plt.scatter(x_vals, y_real, alpha=0.6, color='#4f46e5', label='Données réelles', s=50)
            plt.plot(x_vals, y_pred, color='#ef4444', linewidth=2, label='Courbe de régression')
            plt.xlabel('Valeur X', fontsize=12)
            plt.ylabel('Valeur Y', fontsize=12)
            plt.title('Ajustement du Modèle', fontsize=14, fontweight='bold')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()

            # Save plot to temp file and add to PDF.
            # NOTE(review): if pdf.image raises, the temp file is never
            # unlinked (the outer except still yields an error PDF).
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                plt.savefig(tmp.name, dpi=150, bbox_inches='tight')
                plt.close()
                pdf.image(tmp.name, x=10, w=190)
                os.unlink(tmp.name)

            pdf.ln(10)

        # Residuals plot
        if 'diagnostic_plot' in results and len(results['diagnostic_plot']) > 0:
            pdf.set_font('Arial', 'B', 11)
            pdf.cell(0, 10, "Graphique des Residus", 0, 1)
            pdf.ln(5)

            residuals_data = results['diagnostic_plot']
            fitted = [p['fitted'] for p in residuals_data]
            residuals = [p['residual'] for p in residuals_data]

            plt.figure(figsize=(10, 6))
            plt.scatter(fitted, residuals, alpha=0.6, color='#4f46e5', s=50)
            # Zero line: residuals should scatter evenly around it.
            plt.axhline(y=0, color='#ef4444', linestyle='--', linewidth=2)
            plt.xlabel('Valeurs Ajustees', fontsize=12)
            plt.ylabel('Residus', fontsize=12)
            plt.title('Graphique des Residus', fontsize=14, fontweight='bold')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()

            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                plt.savefig(tmp.name, dpi=150, bbox_inches='tight')
                plt.close()
                pdf.image(tmp.name, x=10, w=190)
                os.unlink(tmp.name)

            pdf.ln(10)

        # 4. Audit Trail (Reproducibility)
        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 10, "Piste d'Audit & Reproductibilite", 0, 1)

        pdf.set_font('Arial', '', 8)
        pdf.set_text_color(100, 116, 139)

        # Cleaning steps
        excluded_count = audit_trail.get('excluded_rows_count', 0)
        if excluded_count is None:
            excluded_count = 0
        pdf.multi_cell(0, 6, f"- Nettoyage : {int(excluded_count)} lignes exclues de l'analyse.")

        # Environment
        pdf.ln(5)
        pdf.set_font('Arial', 'B', 8)
        pdf.cell(0, 6, "Environnement Technique :", 0, 1)
        pdf.set_font('Arial', '', 8)
        pdf.cell(0, 5, f"- Python : {platform.python_version()}", 0, 1)
        pdf.cell(0, 5, f"- Pandas : {pd.__version__}", 0, 1)

        # Try to get sklearn version safely
        try:
            pdf.cell(0, 5, f"- Scikit-learn : {sklearn.__version__}", 0, 1)
        except Exception:
            pdf.cell(0, 5, "- Scikit-learn : Installé", 0, 1)

        # Try to get statsmodels version safely
        try:
            pdf.cell(0, 5, f"- Statsmodels : {statsmodels.__version__}", 0, 1)
        except Exception:
            pdf.cell(0, 5, "- Statsmodels : Installé", 0, 1)

        pdf.cell(0, 5, f"- Random Seed : 42 (Fixed)", 0, 1)

        # Generate PDF bytes using BytesIO
        pdf_buffer = BytesIO()
        pdf.output(pdf_buffer)
        return pdf_buffer.getvalue()

    except Exception as e:
        # Return error as PDF with message using BytesIO
        error_pdf = FPDF()
        error_pdf.add_page()
        error_pdf.set_font('Arial', 'B', 16)
        error_pdf.cell(0, 10, f"Erreur lors de la generation du PDF", 0, 1)
        error_pdf.ln(10)
        error_pdf.set_font('Arial', '', 12)
        error_pdf.multi_cell(0, 10, f"Erreur: {str(e)}")
        error_buffer = BytesIO()
        error_pdf.output(error_buffer)
        return error_buffer.getvalue()
|
||||
430
backend/app/core/engine/stats.py
Normal file
430
backend/app/core/engine/stats.py
Normal file
@@ -0,0 +1,430 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import statsmodels.api as sm
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.inspection import permutation_importance
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from scipy import stats
|
||||
from typing import List, Dict, Any, Tuple
|
||||
import sympy as sp
|
||||
|
||||
def calculate_correlation_matrix(
    df: pd.DataFrame,
    columns: List[str],
    method: str = 'pearson',
    min_threshold: float = None,
    include_pvalues: bool = True
) -> Dict[str, Any]:
    """
    Calculate correlation matrix with optional p-values and filtering.

    Args:
        df: Input DataFrame
        columns: List of column names to analyze
        method: Correlation method ('pearson', 'spearman', 'kendall')
        min_threshold: Minimum absolute correlation value to include (optional)
        include_pvalues: Whether to calculate statistical significance

    Returns:
        Dictionary with matrix data, p-values, and metadata
    """
    if not columns:
        return {"matrix": [], "pvalues": [], "metadata": {}}

    # Convert to numeric and handle missing values
    numeric_df = df[columns].apply(pd.to_numeric, errors='coerce')

    # Remove columns with too many missing values (>50%)
    missing_ratios = numeric_df.isnull().sum() / len(numeric_df)
    valid_cols = missing_ratios[missing_ratios <= 0.5].index.tolist()

    if len(valid_cols) < 2:
        return {"matrix": [], "pvalues": [], "metadata": {"error": "Need at least 2 valid numeric columns"}}

    # Use pairwise deletion for correlation (more robust than listwise)
    clean_df = numeric_df[valid_cols]

    # Calculate correlation matrix
    corr_matrix = clean_df.corr(method=method)

    # Calculate p-values if requested
    pvalue_matrix = None
    if include_pvalues:
        pvalue_matrix = pd.DataFrame(np.zeros_like(corr_matrix),
                                     index=corr_matrix.index,
                                     columns=corr_matrix.columns)

        # Dispatch table replaces the old if/elif chain over methods.
        significance_tests = {
            'pearson': stats.pearsonr,
            'spearman': stats.spearmanr,
            'kendall': stats.kendalltau,
        }

        for i, col1 in enumerate(corr_matrix.columns):
            for j, col2 in enumerate(corr_matrix.index):
                if i != j:
                    # Pairwise complete observations; tests need >= 3 points.
                    valid_data = clean_df[[col1, col2]].dropna()
                    if len(valid_data) >= 3:
                        test = significance_tests.get(method)
                        if test is not None:
                            _, pval = test(valid_data.iloc[:, 0], valid_data.iloc[:, 1])
                        else:
                            pval = np.nan
                        pvalue_matrix.iloc[i, j] = pval

    # Build results
    results = []
    pvalue_results = []

    for x in corr_matrix.columns:
        for y in corr_matrix.index:
            value = float(corr_matrix.at[y, x])

            # Apply threshold filter if specified
            if min_threshold is not None and abs(value) < min_threshold:
                continue

            results.append({
                "x": x,
                "y": y,
                "value": value,
                "abs_value": abs(value)
            })

            if include_pvalues and pvalue_matrix is not None:
                raw_pval = pvalue_matrix.at[y, x]
                has_pval = not pd.isna(raw_pval)
                pvalue_results.append({
                    "x": x,
                    "y": y,
                    "pvalue": float(raw_pval) if has_pval else None,
                    # Bug fix: the old `(pval or 1) < 0.05` treated a p-value of
                    # exactly 0.0 (e.g. a perfect correlation) as NOT significant.
                    # Self-pairs (x == y) are still reported as not significant,
                    # matching the previous output.
                    "significant": bool(has_pval and x != y and raw_pval < 0.05)
                })

    # Calculate summary statistics
    n_observations = len(clean_df)

    return {
        "matrix": results,
        "pvalues": pvalue_results if include_pvalues else [],
        "metadata": {
            "method": method,
            "n_observations": n_observations,
            "n_variables": len(valid_cols),
            "columns_analyzed": valid_cols,
            "threshold_applied": min_threshold
        }
    }
|
||||
|
||||
def get_correlation_summary(correlation_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Generate summary statistics from correlation data.
    Identifies strongest correlations (positive and negative).
    """
    # Self-correlations (diagonal) carry no information; drop them.
    pairs = [m for m in correlation_data.get("matrix", []) if m["x"] != m["y"]]

    if not pairs:
        return {"strongest": [], "weakest": []}

    # Rank all pairs by |r|, highest first.
    ranked = sorted(pairs, key=lambda m: m["abs_value"], reverse=True)

    # Bottom 5 non-zero pairs, re-sorted ascending for display.
    nonzero = [m for m in ranked if m["abs_value"] > 0]
    weakest = sorted(nonzero[-5:], key=lambda m: m["abs_value"])

    return {
        "strongest": ranked[:5],
        "weakest": weakest,
        "total_pairs": len(pairs),
    }
|
||||
|
||||
def calculate_feature_importance(df: pd.DataFrame, features: List[str], target: str) -> List[Dict[str, Any]]:
    """
    Score features via permutation importance of a Random Forest regressor.

    Scores are clipped at zero, normalized to sum to 1 when possible, and
    returned sorted descending by score.
    """
    if not features or not target:
        return []

    # Rows without a target value cannot contribute to the fit.
    df_clean = df.dropna(subset=[target])
    X = df_clean[features].apply(pd.to_numeric, errors='coerce').fillna(0)
    y = df_clean[target]
    # Encode a non-numeric target as integer codes so the regressor can fit it.
    if y.dtype == 'object' or y.dtype == 'string':
        y = pd.factorize(y)[0]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    perm = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=-1)

    scored = [
        {"feature": name, "score": max(0, float(value))}
        for name, value in zip(features, perm.importances_mean)
    ]

    total = sum(item["score"] for item in scored)
    if total > 0:
        for item in scored:
            item["score"] /= total

    return sorted(scored, key=lambda item: item["score"], reverse=True)
|
||||
|
||||
def generate_equations(coefficients: Dict[str, float], model_type: str) -> Dict[str, str]:
    """
    Generate equation strings in LaTeX, Python, and Excel formats.

    Args:
        coefficients: Dictionary of feature names to coefficient values.
            An entry keyed 'const', 'intercept' or '(Intercept)' is treated
            as the intercept.
        model_type: Type of regression model ('linear', 'polynomial',
            'exponential', 'logistic'). Currently informational only — the
            rendered equation is always the linear combination of the
            supplied coefficients.

    Returns:
        Dictionary with 'latex', 'python', and 'excel' equation strings.
    """
    from sympy import symbols, sympify, latex, Float, preorder_traversal, Mul, Pow

    # Split the intercept out from the per-feature coefficients.
    intercept = 0.0
    feature_coefs = {}

    for key, value in coefficients.items():
        if key in ['const', 'intercept', '(Intercept)']:
            intercept = float(value)
        else:
            feature_coefs[key] = float(value)

    # Helper function to format a number cleanly for Python/Excel output.
    def format_number(num: float) -> str:
        """Format number with 3 decimal places max."""
        if num == 0:
            return "0"
        abs_num = abs(num)
        # Use scientific notation for very small or very large numbers.
        if abs_num >= 10000 or (abs_num < 0.001 and abs_num > 0):
            return f"{num:.2e}"
        # Regular decimal with 3 decimal places max.
        formatted = f"{num:.3f}"
        # Remove trailing zeros (and a dangling decimal point).
        return formatted.rstrip('0').rstrip('.')

    def excel_column(index: int) -> str:
        """Map a 0-based column index to spreadsheet letters (0->A, 25->Z, 26->AA).

        Fixes the previous chr(65 + i) scheme, which produced non-letter
        characters once a model had more than 26 features.
        """
        letters = ""
        number = index + 1  # work in 1-based column numbers
        while number > 0:
            number, rem = divmod(number - 1, 26)
            letters = chr(65 + rem) + letters
        return letters

    # Build LaTeX with sympy using scientific notation.
    # Create symbols for each variable (sanitized so sympify accepts them).
    for name in feature_coefs.keys():
        safe_name = name.replace(' ', '_').replace('^', '_pow_')
        symbols(safe_name)

    # Build the expression string. It is assembled only from our own float
    # formatting and sanitized identifiers, so sympify never sees untrusted
    # input.
    expr_parts = []
    intercept_str = f"{intercept:.10f}"
    expr_parts.append(intercept_str)

    for name, coef in feature_coefs.items():
        safe_name = name.replace(' ', '_').replace('^', '_pow_')
        coef_str = f"{coef:.10f}"
        expr_parts.append(f"{coef_str}*{safe_name}")

    expr_str = " + ".join(expr_parts)
    expr = sympify(expr_str)

    # Scientific-notation rounding over the sympy expression tree.
    def scientific_round_expr(e, ndigits=2):
        """
        Convert floats to scientific notation with specified decimal places.
        Example: 12345.678 -> 1.23 × 10^4
        """
        repl = {}
        for node in preorder_traversal(e):
            if isinstance(node, Float):
                val = float(node.evalf(6))  # Get enough precision
                abs_val = abs(val)

                # Use scientific notation for large or small numbers.
                if abs_val >= 10000 or (abs_val < 0.01 and abs_val > 0):
                    sci_str = f"{val:.{ndigits}e}"
                    mantissa, exponent = sci_str.split('e')
                    # Reconstruct as: mantissa × 10^exponent
                    repl[node] = Mul(Float(mantissa), Pow(10, int(exponent)), evaluate=False)
                else:
                    # Regular rounding for normal numbers.
                    repl[node] = Float(round(val, ndigits))

        return e.xreplace(repl)

    # Apply scientific rounding.
    expr_sci = scientific_round_expr(expr, 2)

    # Convert to LaTeX.
    latex_eq_raw = latex(expr_sci, fold_frac_powers=True, fold_short_frac=True, mul_symbol='times')

    # Replace safe names with readable display names. Longest names first:
    # if one safe name is a prefix of another (e.g. 'x' and 'x2'), replacing
    # the shorter one first would corrupt the longer name.
    for name in sorted(feature_coefs.keys(), key=len, reverse=True):
        safe_name = name.replace(' ', '_').replace('^', '_pow_')
        display_name = name.replace('_', ' ')
        latex_eq_raw = latex_eq_raw.replace(safe_name, f"\\mathrm{{{display_name}}}")

    # Add "y = " prefix.
    latex_eq = f"y = {latex_eq_raw}"

    # Build Python format.
    python_parts = []
    for name, coef in feature_coefs.items():
        coef_str = format_number(coef)
        if coef >= 0:
            python_parts.append(f"+ {coef_str}*{name}")
        else:
            python_parts.append(f"- {format_number(abs(coef))}*{name}")

    intercept_str_clean = format_number(intercept)
    python_eq = f"y = {intercept_str_clean} " + ' '.join(python_parts) if python_parts else f"y = {intercept_str_clean}"

    # Generate Excel format (features map to columns A, B, ..., Z, AA, ...).
    col_letters = {name: excel_column(i) for i, name in enumerate(feature_coefs.keys())}

    excel_parts = []
    for name, coef in feature_coefs.items():
        coef_str = format_number(coef)
        col_letter = col_letters[name]
        if coef >= 0:
            excel_parts.append(f"+ {coef_str}*{col_letter}1")
        else:
            excel_parts.append(f"- {format_number(abs(coef))}*{col_letter}1")

    excel_eq = f"={intercept_str_clean} " + ' '.join(excel_parts) if excel_parts else f"={intercept_str_clean}"

    return {
        "latex": latex_eq,
        "python": python_eq,
        "excel": excel_eq
    }
|
||||
|
||||
def run_regression_analysis(df: pd.DataFrame, x_cols: List[str], y_col: str, model_type: str = "linear", poly_degree: int = 1, include_interactions: bool = False) -> Dict[str, Any]:
    """Fit a regression model and return fit statistics plus plot-ready data.

    Supports 'linear', 'polynomial', 'exponential' (log-linear OLS) and
    'logistic' (target binarized at its median). Raises ValueError on
    insufficient data or when model fitting fails.

    Args:
        df: Source data; X columns are coerced to numeric.
        x_cols: Predictor column names.
        y_col: Response column name.
        model_type: One of 'linear', 'polynomial', 'exponential', 'logistic'.
        poly_degree: Degree used when model_type == 'polynomial'.
        include_interactions: Add interaction terms via PolynomialFeatures.

    Returns:
        Dict with model statistics (r_squared, aic, bic, coefficients,
        p_values, ...) and visualization payloads (fit plots per feature,
        residuals-vs-fitted, partial regression plots, equation strings).
    """
    # 1. Prep Data
    # Capture original X for plotting before transformation
    X_original = df[x_cols].apply(pd.to_numeric, errors='coerce')
    y_data = df[y_col]

    # Align indices after dropna
    data = pd.concat([X_original, y_data], axis=1).dropna()
    # Need more observations than predictors for a meaningful fit.
    if data.empty or len(data) < len(x_cols) + 1:
        raise ValueError("Insufficient data.")

    X_raw = data[x_cols] # Keep for plotting
    # NOTE(review): if y_col held non-numeric strings, coercion here can
    # re-introduce NaN after the dropna above — confirm upstream validation.
    y = pd.to_numeric(data[y_col], errors='coerce')

    X = X_raw.copy() # Start with raw for modelling

    # 2. Advanced Feature Engineering
    if model_type == "polynomial" or include_interactions:
        degree = poly_degree if model_type == "polynomial" else 2
        interaction_only = include_interactions and model_type != "polynomial"
        poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)
        X_poly = poly.fit_transform(X)
        poly_cols = poly.get_feature_names_out(X.columns)
        X = pd.DataFrame(X_poly, columns=poly_cols, index=X.index)

    # 3. Model Fitting
    try:
        model = None
        y_pred = None

        if model_type == "logistic":
            X_const = sm.add_constant(X)
            # Binarize the response at its median for logistic regression.
            y_bin = (y > y.median()).astype(int)
            model = sm.Logit(y_bin, X_const).fit(disp=0)
            y_pred = model.predict(X_const)
            # Downstream plots/residuals use the binarized response.
            y = y_bin
        elif model_type == "exponential":
            if (y <= 0).any(): raise ValueError("Exponential regression requires Y > 0.")
            # Fit log(y) linearly, then exponentiate predictions back.
            y_log = np.log(y)
            X_const = sm.add_constant(X)
            lin_model = sm.OLS(y_log, X_const).fit()
            y_pred = np.exp(lin_model.predict(X_const))
            model = lin_model
        else: # Linear or Polynomial
            X_const = sm.add_constant(X)
            model = sm.OLS(y, X_const).fit()
            y_pred = model.predict(X_const)

        # 4. Construct Visualization Data
        # Create fit plots for each original feature
        fit_plots_by_feature = {}
        residuals_vs_fitted = []

        y_list = y.tolist()
        pred_list = y_pred.tolist()

        residuals = []

        # Create a fit plot for each original feature
        for feature_name in X_raw.columns:
            x_feature_list = X_raw[feature_name].tolist()
            feature_plot = []

            for i in range(len(y_list)):
                feature_plot.append({
                    "x": float(x_feature_list[i]),
                    "real": float(y_list[i]),
                    "pred": float(pred_list[i])
                })

            # Sort by X for proper curve rendering
            feature_plot.sort(key=lambda item: item["x"])
            fit_plots_by_feature[feature_name] = feature_plot

        # Also create a single fit_plot using the first feature for backward compatibility
        fit_plot = fit_plots_by_feature[X_raw.columns[0]] if len(X_raw.columns) > 0 else []

        # Residuals plot (observed - predicted vs fitted value)
        for i in range(len(y_list)):
            res_val = y_list[i] - pred_list[i]
            residuals.append(res_val)

            residuals_vs_fitted.append({
                "fitted": float(pred_list[i]),
                "residual": res_val
            })

        # 5. Calculate Partial Regression Plots (Added Variable Plots)
        # These show the isolated effect of each variable controlling for others
        partial_regression_plots = {}

        # Only calculate for multiple regression (more than 1 feature)
        if len(X_raw.columns) > 1:
            for feature_name in X_raw.columns:
                # Get other features (all except current)
                other_features = [col for col in X_raw.columns if col != feature_name]

                if len(other_features) == 0:
                    continue

                # Step 1: Regress Y on all features except current one
                X_other = X_raw[other_features]
                X_other_const = sm.add_constant(X_other)
                model_y = sm.OLS(y, X_other_const).fit()
                y_residuals = y - model_y.predict(X_other_const)

                # Step 2: Regress current feature on other features
                model_x = sm.OLS(X_raw[feature_name], X_other_const).fit()
                x_residuals = X_raw[feature_name] - model_x.predict(X_other_const)

                # Step 3: Create partial plot data (residual vs residual)
                partial_plot = []
                for i in range(len(y)):
                    partial_plot.append({
                        "x": float(x_residuals.iloc[i]),
                        "y": float(y_residuals.iloc[i])
                    })

                # Sort by x for proper line rendering
                partial_plot.sort(key=lambda item: item["x"])
                partial_regression_plots[feature_name] = partial_plot

        # Generate equation strings
        equations = generate_equations(model.params.to_dict(), model_type)

        # Logit results expose prsquared (pseudo R²) instead of rsquared.
        summary = {
            "r_squared": float(model.rsquared) if hasattr(model, 'rsquared') else float(model.prsquared),
            "adj_r_squared": float(model.rsquared_adj) if hasattr(model, 'rsquared_adj') else None,
            "aic": float(model.aic),
            "bic": float(model.bic),
            "coefficients": model.params.to_dict(),
            "p_values": model.pvalues.to_dict(),
            "std_errors": model.bse.to_dict(),
            "sample_size": int(model.nobs),
            "residuals": residuals,
            "fit_plot": fit_plot, # Backward compatibility (first feature)
            "fit_plots_by_feature": fit_plots_by_feature, # All features
            "partial_regression_plots": partial_regression_plots, # Partial plots for multivariate
            "diagnostic_plot": residuals_vs_fitted,
            "equations": equations # LaTeX, Python, Excel formats
        }
        return summary
    except Exception as e:
        # Boundary: surface any fitting failure as ValueError for the API layer.
        raise ValueError(f"Model calculation failed: {str(e)}")
|
||||
294
backend/generate_outlier_test_data.py
Normal file
294
backend/generate_outlier_test_data.py
Normal file
@@ -0,0 +1,294 @@
|
||||
#!/usr/bin/env python3
"""
Test-data generator for exercising UNIVARIATE and MULTIVARIATE outlier detection.

This script builds a dataset containing:
1. Obvious univariate outliers (extreme values in a single column)
2. Multivariate outliers (value combinations that are individually normal
   but abnormal together)
3. Normal data for the majority of rows
"""

import pandas as pd
import numpy as np
from pathlib import Path

# Configuration: fixed seed keeps the generated outliers reproducible.
np.random.seed(42)
N_NORMAL = 100  # Number of normal (non-outlier) baseline rows
|
||||
|
||||
def generate_outlier_dataset():
|
||||
"""
|
||||
Génère un dataset avec des outliers contrôlés pour tester les deux types de détection.
|
||||
"""
|
||||
data = []
|
||||
|
||||
# ========================================================================
|
||||
# 1. DONNÉES NORMALES (baseline)
|
||||
# ========================================================================
|
||||
print("📊 Génération des données normales...")
|
||||
|
||||
for i in range(N_NORMAL):
|
||||
data.append({
|
||||
"ID": i + 1,
|
||||
"Age": np.random.normal(40, 10), # Moyenne 40, écart-type 10
|
||||
"Salaire": np.random.normal(35000, 8000), # Moyenne 35k, écart-type 8k
|
||||
"Experience": np.random.normal(10, 4), # Moyenne 10 ans, écart-type 4
|
||||
"Performance": np.random.normal(75, 10), # Moyenne 75/100, écart-type 10
|
||||
"Heures_Sup": np.random.normal(5, 3), # Moyenne 5h/mois, écart-type 3
|
||||
"Type": "Normal"
|
||||
})
|
||||
|
||||
# ========================================================================
|
||||
# 2. OUTLIERS UNIVARIÉS (évidents dans UNE colonne)
|
||||
# ========================================================================
|
||||
print("🔴 Génération des outliers univariés...")
|
||||
|
||||
# Outlier 1: Âge extrême (150 ans - impossible)
|
||||
data.append({
|
||||
"ID": 101,
|
||||
"Age": 150,
|
||||
"Salaire": 38000,
|
||||
"Experience": 12,
|
||||
"Performance": 78,
|
||||
"Heures_Sup": 6,
|
||||
"Type": "Outlier_Uni_Age"
|
||||
})
|
||||
|
||||
# Outlier 2: Salaire extrêmement élevé (500k - 10x la normale)
|
||||
data.append({
|
||||
"ID": 102,
|
||||
"Age": 45,
|
||||
"Salaire": 500000,
|
||||
"Experience": 15,
|
||||
"Performance": 82,
|
||||
"Heures_Sup": 8,
|
||||
"Type": "Outlier_Uni_Salaire"
|
||||
})
|
||||
|
||||
# Outlier 3: Salaire négatif (impossible)
|
||||
data.append({
|
||||
"ID": 103,
|
||||
"Age": 35,
|
||||
"Salaire": -5000,
|
||||
"Experience": 8,
|
||||
"Performance": 72,
|
||||
"Heures_Sup": 4,
|
||||
"Type": "Outlier_Uni_Salaire_Neg"
|
||||
})
|
||||
|
||||
# Outlier 4: Performance > 100 (impossible)
|
||||
data.append({
|
||||
"ID": 104,
|
||||
"Age": 38,
|
||||
"Salaire": 42000,
|
||||
"Experience": 11,
|
||||
"Performance": 150,
|
||||
"Heures_Sup": 7,
|
||||
"Type": "Outlier_Uni_Perf"
|
||||
})
|
||||
|
||||
# Outlier 5: Heures supplémentaires négatives
|
||||
data.append({
|
||||
"ID": 105,
|
||||
"Age": 42,
|
||||
"Salaire": 36000,
|
||||
"Experience": 13,
|
||||
"Performance": 76,
|
||||
"Heures_Sup": -20,
|
||||
"Type": "Outlier_Uni_Heures"
|
||||
})
|
||||
|
||||
# ========================================================================
|
||||
# 3. OUTLIERS MULTIVARIÉS (normaux individuellement, anormaux ensemble)
|
||||
# ========================================================================
|
||||
print("🟣 Génération des outliers multivariés...")
|
||||
|
||||
# Outlier Multivarié 1: Jeune avec BEAUCUP d'expérience (impossible)
|
||||
# Age=25 (normal) mais Experience=30 (impossible pour cet âge)
|
||||
data.append({
|
||||
"ID": 201,
|
||||
"Age": 25,
|
||||
"Salaire": 32000,
|
||||
"Experience": 30,
|
||||
"Performance": 70,
|
||||
"Heures_Sup": 5,
|
||||
"Type": "Outlier_Multi_Age_Exp"
|
||||
})
|
||||
|
||||
# Outlier Multivarié 2: Haut salaire avec basse performance (suspect)
|
||||
# Salaire=80k (normal possible) mais Performance=40 (anormalement bas pour ce salaire)
|
||||
data.append({
|
||||
"ID": 202,
|
||||
"Age": 45,
|
||||
"Salaire": 80000,
|
||||
"Experience": 15,
|
||||
"Performance": 40,
|
||||
"Heures_Sup": 2,
|
||||
"Type": "Outlier_Multi_Salaire_Perf"
|
||||
})
|
||||
|
||||
# Outlier Multivarié 3: Faible expérience avec très haut salaire (suspect)
|
||||
data.append({
|
||||
"ID": 203,
|
||||
"Age": 28,
|
||||
"Salaire": 95000,
|
||||
"Experience": 1,
|
||||
"Performance": 85,
|
||||
"Heures_Sup": 15,
|
||||
"Type": "Outlier_Multi_Exp_Salaire"
|
||||
})
|
||||
|
||||
# Outlier Multivarié 4: Personne âgée avec junior-level tout
|
||||
data.append({
|
||||
"ID": 204,
|
||||
"Age": 65,
|
||||
"Salaire": 25000,
|
||||
"Experience": 1,
|
||||
"Performance": 60,
|
||||
"Heures_Sup": 0,
|
||||
"Type": "Outlier_Multi_Senior_Junior"
|
||||
})
|
||||
|
||||
# Outlier Multivarié 5: Performance parfaite avec 0 heures supp (rare)
|
||||
data.append({
|
||||
"ID": 205,
|
||||
"Age": 35,
|
||||
"Salaire": 40000,
|
||||
"Experience": 10,
|
||||
"Performance": 100,
|
||||
"Heures_Sup": 0,
|
||||
"Type": "Outlier_Multi_Perf_Heures"
|
||||
})
|
||||
|
||||
# Outlier Multivarié 6: Combinaison impossible - Junior avec salaire senior ET perf max
|
||||
data.append({
|
||||
"ID": 206,
|
||||
"Age": 22,
|
||||
"Salaire": 85000,
|
||||
"Experience": 0,
|
||||
"Performance": 95,
|
||||
"Heures_Sup": 0,
|
||||
"Type": "Outlier_Multi_Impossible"
|
||||
})
|
||||
|
||||
# ========================================================================
|
||||
# 4. CAS LIMITES (valeurs frontières)
|
||||
# ========================================================================
|
||||
print("🎯 Génération des cas limites...")
|
||||
|
||||
# Cas limite 1: Zéro partout (suspect mais pas impossible)
|
||||
data.append({
|
||||
"ID": 301,
|
||||
"Age": 22,
|
||||
"Salaire": 0,
|
||||
"Experience": 0,
|
||||
"Performance": 0,
|
||||
"Heures_Sup": 0,
|
||||
"Type": "Cas_Limie_Zeros"
|
||||
})
|
||||
|
||||
# Cas limite 2: Très âgé avec beaucoup d'expérience (normal)
|
||||
data.append({
|
||||
"ID": 302,
|
||||
"Age": 62,
|
||||
"Salaire": 70000,
|
||||
"Experience": 40,
|
||||
"Performance": 88,
|
||||
"Heures_Sup": 3,
|
||||
"Type": "Normal_Senior"
|
||||
})
|
||||
|
||||
# Cas limite 3: Salaire minimum légal (normal)
|
||||
data.append({
|
||||
"ID": 303,
|
||||
"Age": 25,
|
||||
"Salaire": 15000,
|
||||
"Experience": 2,
|
||||
"Performance": 65,
|
||||
"Heures_Sup": 2,
|
||||
"Type": "Normal_Salaire_Min"
|
||||
})
|
||||
|
||||
# Création du DataFrame
|
||||
df = pd.DataFrame(data)
|
||||
|
||||
# Arrondir les valeurs numériques pour plus de clarté
|
||||
numeric_cols = ["Age", "Salaire", "Experience", "Performance", "Heures_Sup"]
|
||||
for col in numeric_cols:
|
||||
df[col] = df[col].round(2)
|
||||
|
||||
# Réordonner les colonnes
|
||||
df = df[["ID", "Age", "Experience", "Salaire", "Performance", "Heures_Sup", "Type"]]
|
||||
|
||||
return df
|
||||
|
||||
def main():
    """Generate the outlier test dataset, write CSV/XLSX files, print a summary."""
    print("=" * 70)
    print("🧪 GÉNÉRATEUR DE DONNÉES DE TEST - OUTLIERS UNIVARIÉS & MULTIVARIÉS")
    print("=" * 70)
    print()

    # Generate the dataset
    df = generate_outlier_dataset()

    # Save next to this script. Was a hard-coded absolute home-directory
    # path, which broke on any other machine; now resolved relative to the
    # script, matching the sibling generate_test_data.py generator.
    output_dir = Path(__file__).resolve().parent / "test_data"
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / "test_outliers_complete.csv"
    df.to_csv(csv_path, index=False)

    excel_path = output_dir / "test_outliers_complete.xlsx"
    df.to_excel(excel_path, index=False)

    # Print dataset statistics
    print()
    print("=" * 70)
    print("📊 STATISTIQUES DU DATASET")
    print("=" * 70)
    print(f"✅ Total lignes : {len(df)}")
    print(f"📈 Colonnes : {len(df.columns)}")
    print()
    print("🔴 Outliers univariés attendus : 5")
    print(" - ID 101: Âge = 150 ans")
    print(" - ID 102: Salaire = 500,000€")
    print(" - ID 103: Salaire = -5,000€ (négatif)")
    print(" - ID 104: Performance = 150 (>100)")
    print(" - ID 105: Heures_Sup = -20 (négatif)")
    print()
    print("🟣 Outliers multivariés attendus : 6")
    print(" - ID 201: Âge=25 avec Exp=30 (impossible)")
    print(" - ID 202: Salaire=80k avec Perf=40 (incohérent)")
    print(" - ID 203: Exp=1 avec Salaire=95k (suspect)")
    print(" - ID 204: Âge=65 avec Exp=1 (incohérent)")
    print(" - ID 205: Perf=100 avec Heures_Sup=0 (rare)")
    print(" - ID 206: Âge=22, Exp=0, Salaire=85k (impossible)")
    print()
    print("=" * 70)
    print("💾 FICHIERS GÉNÉRÉS")
    print("=" * 70)
    print(f"📄 CSV : {csv_path}")
    print(f"📊 Excel : {excel_path}")
    print()
    print("=" * 70)
    print("🎯 COMMENT TESTER")
    print("=" * 70)
    print("1. Importez le fichier 'test_outliers_complete.csv' dans l'application")
    print("2. Vérifiez que les colonnes sont bien détectées comme numériques")
    print("3. Les cercles ROUGES doivent apparaître sur les colonnes avec outliers univariés")
    print("4. Le cercle VIOLET doit apparaître (indicateur global multivarié)")
    print("5. Cliquez sur chaque indicateur pour voir les détails")
    print("6. Vérifiez la cohérence des outliers détectés")
    print()
    print("✨ Bon testing !")
    print("=" * 70)

    # Preview of the outlier rows
    print()
    print("📋 APERÇU DES OUTLIERS DANS LE DATASET :")
    print("-" * 70)
    outliers_df = df[df["Type"].str.contains("Outlier", case=False, na=False)]
    print(outliers_df[["ID", "Type", "Age", "Experience", "Salaire", "Performance", "Heures_Sup"]].to_string(index=False))

if __name__ == "__main__":
    main()
|
||||
266
backend/generate_test_data.py
Normal file
266
backend/generate_test_data.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""
|
||||
Générateur de données de test pour l'outil d'analyse de données.
|
||||
Crée des fichiers CSV et XLSX avec des corrélations et relations polynomiales.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# Configuration
|
||||
np.random.seed(42)
|
||||
n_samples = 500
|
||||
output_dir = Path(__file__).parent.parent / "test_data"
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
print(f"📁 Génération des données de test dans: {output_dir}")
|
||||
|
||||
# ============================================================================
# 1. Sales & Marketing dataset (multiple linear correlations)
# ============================================================================
print("\n📊 Dataset: Ventes & Marketing")

ventes_data = {
    # Monthly marketing budget (€)
    'budget_marketing': np.random.uniform(1000, 50000, n_samples),

    # Hours of TV advertising
    'tv_ads_hours': np.random.uniform(5, 100, n_samples),

    # Number of social-media posts
    'social_media_posts': np.random.randint(10, 200, n_samples),

    # Average product price (€) — slight negative correlation with sales
    'prix_moyen': np.random.uniform(20, 100, n_samples),

    # Average temperature (°C) — seasonal effect
    'temperature': np.random.normal(20, 8, n_samples),

    # Competition (index 1-10)
    'concurrence': np.random.randint(1, 11, n_samples),
}

df_ventes = pd.DataFrame(ventes_data)

# Build sales from realistic additive effects.
# Base sales level
ventes_base = 5000

# Marketing budget effect (strongly positive)
effet_budget = df_ventes['budget_marketing'] * 0.8

# TV ads effect (moderate)
effet_tv = df_ventes['tv_ads_hours'] * 25

# Social media effect (weak but positive)
effet_social = df_ventes['social_media_posts'] * 8

# Price effect (negative — higher price means fewer sales)
effet_prix = -df_ventes['prix_moyen'] * 15

# Temperature effect (positive — better weather means more sales)
effet_temp = df_ventes['temperature'] * 30

# Competition effect (negative)
effet_concurrence = -df_ventes['concurrence'] * 200

# Random noise
bruit = np.random.normal(0, 500, n_samples)

# Total sales
df_ventes['ventes'] = (ventes_base + effet_budget + effet_tv + effet_social +
                       effet_prix + effet_temp + effet_concurrence + bruit)

# Round for readability
df_ventes['ventes'] = df_ventes['ventes'].round(2)

# Save as both CSV and Excel
ventes_csv = output_dir / "ventes_marketing.csv"
ventes_excel = output_dir / "ventes_marketing.xlsx"

df_ventes.to_csv(ventes_csv, index=False)
df_ventes.to_excel(ventes_excel, index=False)

print(f" ✅ {ventes_csv.name}: {n_samples} lignes, {len(df_ventes.columns)} colonnes")
print(f" ✅ {ventes_excel.name}: {n_samples} lignes, {len(df_ventes.columns)} colonnes")
|
||||
|
||||
# ============================================================================
# 2. Industrial Production dataset (polynomial / quadratic relationship)
# ============================================================================
print("\n🏭 Dataset: Production Industrielle")

production_data = {
    # Machine temperature (°C)
    'temperature_machine': np.random.uniform(150, 250, n_samples),

    # Pressure (bar)
    'pression': np.random.uniform(2, 10, n_samples),

    # Conveyor speed (m/min)
    'vitesse_conveyor': np.random.uniform(50, 150, n_samples),

    # Humidity (%)
    'humidite': np.random.uniform(30, 70, n_samples),

    # Raw-material quality (index 1-100)
    'qualite_matiere': np.random.uniform(60, 100, n_samples),
}

df_production = pd.DataFrame(production_data)

# Polynomial relationship: optimal temperature ~200°C
# Efficiency = a*(T - Toptimal)^2 + b
T_optimal = 200
df_production['efficacite_production'] = (
    -0.08 * (df_production['temperature_machine'] - T_optimal) ** 2
    + 95  # Maximum efficiency
    + df_production['pression'] * 1.5
    + df_production['vitesse_conveyor'] * 0.1
    + df_production['qualite_matiere'] * 0.3
    + np.random.normal(0, 3, n_samples)
)

# Clamp to [0, 100]
df_production['efficacite_production'] = df_production['efficacite_production'].clip(0, 100).round(2)

# Production defects (inverse relationship with efficiency)
df_production['defauts'] = (
    100 - df_production['efficacite_production']
) * 0.5 + np.random.normal(0, 1, n_samples)
df_production['defauts'] = df_production['defauts'].clip(0, None).round(2)

# Save as both CSV and Excel
production_csv = output_dir / "production_industrielle.csv"
production_excel = output_dir / "production_industrielle.xlsx"

df_production.to_csv(production_csv, index=False)
df_production.to_excel(production_excel, index=False)

print(f" ✅ {production_csv.name}: {n_samples} lignes, {len(df_production.columns)} colonnes")
print(f" ✅ {production_excel.name}: {n_samples} lignes, {len(df_production.columns)} colonnes")
|
||||
|
||||
# ============================================================================
# 3. Health & Fitness dataset (mixed relationships)
# ============================================================================
print("\n🏃 Dataset: Santé & Fitness")

sante_data = {
    # Age (years)
    'age': np.random.randint(18, 80, n_samples),

    # Weight (kg)
    'poids': np.random.normal(75, 15, n_samples),

    # Height (cm)
    'taille': np.random.normal(170, 10, n_samples),

    # Hours of exercise per week
    'heures_exercice': np.random.uniform(0, 15, n_samples),

    # Calories consumed per day
    'calories_jour': np.random.normal(2200, 400, n_samples),

    # Hours of sleep
    'heures_sommeil': np.random.normal(7, 1.5, n_samples),

    # Stress level (1-10)
    'stress': np.random.randint(1, 11, n_samples),
}

df_sante = pd.DataFrame(sante_data)

# BMI (IMC)
df_sante['imc'] = (df_sante['poids'] / (df_sante['taille'] / 100) ** 2).round(2)

# Basal metabolic rate (simplified Harris-Benedict formula)
# Men: BMR = 88.362 + (13.397 × kg) + (4.799 × cm) - (5.677 × age)
df_sante['metabolisme_base'] = (
    88.362
    + 13.397 * df_sante['poids']
    + 4.799 * df_sante['taille']
    - 5.677 * df_sante['age']
    + np.random.normal(0, 50, n_samples)
).round(2)

# Energy level (subjective, 1-10)
df_sante['niveau_energie'] = (
    5
    + 0.3 * df_sante['heures_exercice']
    - 0.2 * df_sante['stress']
    + 0.15 * df_sante['heures_sommeil']
    - 0.01 * (df_sante['age'] - 30)
    + np.random.normal(0, 1, n_samples)
).clip(1, 10).round(2)

# Save as both CSV and Excel
sante_csv = output_dir / "sante_fitness.csv"
sante_excel = output_dir / "sante_fitness.xlsx"

df_sante.to_csv(sante_csv, index=False)
df_sante.to_excel(sante_excel, index=False)

print(f" ✅ {sante_csv.name}: {n_samples} lignes, {len(df_sante.columns)} colonnes")
print(f" ✅ {sante_excel.name}: {n_samples} lignes, {len(df_sante.columns)} colonnes")
|
||||
|
||||
# ============================================================================
# 4. Finance dataset (exponential relationship)
# ============================================================================
print("\n💰 Dataset: Finance & Investissement")

finance_data = {
    # Amount invested (€)
    'montant_investi': np.random.uniform(1000, 100000, n_samples),

    # Investment duration (years)
    'duree_annees': np.random.uniform(1, 30, n_samples),

    # Annual rate of return (%)
    'taux_rendement': np.random.uniform(2, 15, n_samples),

    # Risk level (1-10)
    'niveau_risque': np.random.randint(1, 11, n_samples),
}

df_finance = pd.DataFrame(finance_data)

# Final value with compound interest: A = P(1 + r)^t
df_finance['valeur_finale'] = (
    df_finance['montant_investi'] *
    (1 + df_finance['taux_rendement'] / 100) ** df_finance['duree_annees']
    * (1 - 0.02 * df_finance['niveau_risque'])  # Risk penalty
    + np.random.normal(0, df_finance['montant_investi'] * 0.01, n_samples)
).round(2)

# Profit
df_finance['profit'] = (df_finance['valeur_finale'] - df_finance['montant_investi']).round(2)

# Total return (%)
df_finance['rendement_total'] = (df_finance['profit'] / df_finance['montant_investi'] * 100).round(2)

# Save as both CSV and Excel
finance_csv = output_dir / "finance_investissement.csv"
finance_excel = output_dir / "finance_investissement.xlsx"

df_finance.to_csv(finance_csv, index=False)
df_finance.to_excel(finance_excel, index=False)

print(f" ✅ {finance_csv.name}: {n_samples} lignes, {len(df_finance.columns)} colonnes")
print(f" ✅ {finance_excel.name}: {n_samples} lignes, {len(df_finance.columns)} colonnes")
|
||||
|
||||
# ============================================================================
# Summary of everything generated above
# ============================================================================
print("\n" + "="*60)
print("✅ Tous les fichiers de test ont été générés avec succès !")
print("="*60)
print(f"\n📂 Répertoire: {output_dir}")
print("\n📋 Fichiers créés:")
print(" 1. ventes_marketing.csv/xlsx - Corrélations multiples linéaires")
print(" 2. production_industrielle.csv/xlsx - Relation polynomiale (quadratique)")
print(" 3. sante_fitness.csv/xlsx - Relations mixtes + IMC calculé")
print(" 4. finance_investissement.csv/xlsx - Relation exponentielle")
print("\n💡 Utilisez ces fichiers pour tester:")
print(" • Analyse de corrélation")
print(" • Régression linéaire, polynomiale, exponentielle")
print(" • Import CSV et Excel")
print("="*60)
|
||||
32
backend/main.py
Normal file
32
backend/main.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from app.api.v1.upload import router as upload_router
|
||||
from app.api.v1.analysis import router as analysis_router
|
||||
from app.api.v1.reports import router as reports_router
|
||||
|
||||
# Application instance; title/version surface in the generated OpenAPI docs.
app = FastAPI(title="Data_analysis API", version="0.1.0")

# CORS configuration
origins = ["*"]  # Allow all origins for dev/homelab simplicity

# NOTE(review): allow_credentials=True combined with a wildcard origin is
# rejected by browsers for credentialed requests — fine for dev, revisit
# before any production deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
    # Custom headers the frontend must be able to read from responses.
    expose_headers=["X-Column-Metadata", "Content-Disposition"]
)

# Register routers under the versioned API prefix.
app.include_router(upload_router, prefix="/api/v1")
app.include_router(analysis_router, prefix="/api/v1")
app.include_router(reports_router, prefix="/api/v1")
|
||||
|
||||
@app.get("/health")
|
||||
def health_check():
|
||||
return {"status": "ok", "service": "backend"}
|
||||
|
||||
@app.get("/")
|
||||
def read_root():
|
||||
return {"message": "Welcome to Data_analysis API"}
|
||||
22
backend/pyproject.toml
Normal file
22
backend/pyproject.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[project]
|
||||
name = "backend"
|
||||
version = "0.1.0"
|
||||
description = "FastAPI backend for the Data_analysis service (upload, analysis and reporting API)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"fastapi>=0.128.0",
|
||||
"fpdf2>=2.8.5",
|
||||
"httpx>=0.28.1",
|
||||
"matplotlib>=3.10.0",
|
||||
"openpyxl>=3.1.5",
|
||||
"pandas>=2.3.3",
|
||||
"pyarrow>=22.0.0",
|
||||
"pydantic>=2.12.5",
|
||||
"pytest>=9.0.2",
|
||||
"python-multipart>=0.0.21",
|
||||
"scikit-learn>=1.8.0",
|
||||
"statsmodels>=0.14.6",
|
||||
"sympy>=1.13.0",
|
||||
"uvicorn>=0.40.0",
|
||||
]
|
||||
BIN
backend/test.pdf
Normal file
BIN
backend/test.pdf
Normal file
Binary file not shown.
115
backend/test_data/test_outliers_complete.csv
Normal file
115
backend/test_data/test_outliers_complete.csv
Normal file
@@ -0,0 +1,115 @@
|
||||
ID,Age,Experience,Salaire,Performance,Heures_Sup,Type
|
||||
1,44.97,12.59,33893.89,90.23,4.3,Normal
|
||||
2,37.66,13.07,47633.7,70.31,6.63,Normal
|
||||
3,35.37,10.97,31274.16,55.87,-0.17,Normal
|
||||
4,34.38,11.26,26897.35,65.92,0.76,Normal
|
||||
5,54.66,10.27,33193.79,60.75,3.37,Normal
|
||||
6,41.11,11.5,25792.05,68.99,4.12,Normal
|
||||
7,33.98,9.95,49818.23,64.42,7.47,Normal
|
||||
8,27.79,2.16,36670.91,61.72,5.59,Normal
|
||||
9,47.38,9.54,36370.95,71.99,0.56,Normal
|
||||
10,32.8,14.23,31314.89,78.44,-0.29,Normal
|
||||
11,43.24,7.29,31919.34,81.12,8.09,Normal
|
||||
12,49.31,8.76,28286.26,78.31,7.93,Normal
|
||||
13,35.21,5.57,33514.73,63.04,7.44,Normal
|
||||
14,53.56,14.01,34423.92,78.62,3.06,Normal
|
||||
15,43.61,9.86,47304.29,90.65,-2.86,Normal
|
||||
16,48.22,8.8,35696.38,75.92,-0.96,Normal
|
||||
17,37.8,15.91,37856.9,69.82,2.57,Normal
|
||||
18,34.98,11.32,42323.22,69.7,6.54,Normal
|
||||
19,40.97,7.19,42749.16,71.72,3.82,Normal
|
||||
20,25.36,11.04,37368.96,75.05,4.3,Normal
|
||||
21,25.85,8.63,31634.84,66.98,4.52,Normal
|
||||
22,44.04,10.7,50089.49,77.58,4.78,Normal
|
||||
23,20.81,10.24,34787.89,99.63,4.42,Normal
|
||||
24,43.02,5.33,34722.31,86.43,7.26,Normal
|
||||
25,47.91,15.61,27724.9,60.98,6.76,Normal
|
||||
26,61.9,7.73,27075.71,76.0,3.49,Normal
|
||||
27,24.49,5.75,35548.5,79.74,2.24,Normal
|
||||
28,55.5,8.71,28733.97,83.14,1.31,Normal
|
||||
29,42.27,3.57,45457.14,76.85,5.78,Normal
|
||||
30,47.82,4.72,25104.39,80.22,5.89,Normal
|
||||
31,42.5,7.28,37771.59,77.32,5.88,Normal
|
||||
32,32.86,11.9,49926.2,63.09,6.97,Normal
|
||||
33,30.25,14.63,41296.68,66.79,7.89,Normal
|
||||
34,44.13,17.59,41576.48,72.55,2.74,Normal
|
||||
35,31.1,9.69,28473.52,78.41,5.83,Normal
|
||||
36,48.27,15.81,35104.02,72.35,13.16,Normal
|
||||
37,46.26,5.72,28142.74,79.82,4.33,Normal
|
||||
38,47.14,9.71,38785.9,66.53,0.46,Normal
|
||||
39,35.53,10.86,41851.19,62.54,5.52,Normal
|
||||
40,43.85,10.61,27929.14,75.58,1.57,Normal
|
||||
41,43.58,14.33,39486.28,85.54,0.87,Normal
|
||||
42,30.62,12.06,39120.28,80.15,16.56,Normal
|
||||
43,45.71,13.82,44084.53,81.51,4.05,Normal
|
||||
44,47.59,9.05,28817.4,70.15,5.25,Normal
|
||||
45,63.15,12.75,20061.88,58.87,3.58,Normal
|
||||
46,50.89,5.69,35514.24,67.85,7.04,Normal
|
||||
47,32.7,10.18,36731.67,68.48,11.43,Normal
|
||||
48,46.34,10.75,18798.86,68.38,7.56,Normal
|
||||
49,32.07,12.02,34082.11,83.66,1.4,Normal
|
||||
50,36.65,7.39,31200.44,92.65,6.21,Normal
|
||||
51,27.39,18.49,42342.9,85.32,0.44,Normal
|
||||
52,35.16,7.17,45135.29,79.44,7.32,Normal
|
||||
53,30.73,-2.97,34523.8,64.76,4.24,Normal
|
||||
54,27.52,4.28,48059.29,70.6,5.39,Normal
|
||||
55,54.41,14.65,23513.1,75.1,2.06,Normal
|
||||
56,44.62,7.6,36592.48,75.7,3.84,Normal
|
||||
57,41.14,16.34,40297.05,62.62,11.4,Normal
|
||||
58,20.48,12.35,33785.72,77.81,3.13,Normal
|
||||
59,37.92,7.64,31055.99,83.5,6.07,Normal
|
||||
60,33.07,11.23,42196.8,83.13,6.89,Normal
|
||||
61,31.71,12.99,30518.55,81.1,4.94,Normal
|
||||
62,41.17,7.63,45221.32,80.47,4.39,Normal
|
||||
63,37.82,13.3,43790.21,83.14,8.92,Normal
|
||||
64,40.21,8.76,40455.62,78.24,4.61,Normal
|
||||
65,40.97,6.73,39761.26,95.92,1.98,Normal
|
||||
66,27.86,13.17,44264.89,81.24,6.89,Normal
|
||||
67,39.88,10.3,27821.97,68.23,7.93,Normal
|
||||
68,38.53,8.71,28396.02,79.13,3.31,Normal
|
||||
69,31.78,10.98,36949.5,69.93,3.59,Normal
|
||||
70,42.32,4.37,23415.33,67.82,4.36,Normal
|
||||
71,43.11,13.43,46802.85,73.4,4.94,Normal
|
||||
72,29.97,8.85,34851.89,78.23,2.52,Normal
|
||||
73,45.19,9.56,47261.91,79.02,7.07,Normal
|
||||
74,35.99,10.05,36792.74,75.98,2.68,Normal
|
||||
75,40.25,15.8,38983.99,84.59,11.46,Normal
|
||||
76,32.33,10.73,41978.57,96.9,2.58,Normal
|
||||
77,31.6,1.5,30204.86,69.74,2.72,Normal
|
||||
78,41.5,17.5,37734.05,84.5,3.27,Normal
|
||||
79,31.02,4.72,38935.35,93.31,8.54,Normal
|
||||
80,35.31,15.42,21294.92,73.85,8.71,Normal
|
||||
81,24.06,10.02,30205.0,75.47,3.65,Normal
|
||||
82,46.23,9.43,26459.04,76.2,6.54,Normal
|
||||
83,47.12,3.86,26002.86,87.78,6.0,Normal
|
||||
84,32.52,10.46,47409.22,86.79,5.2,Normal
|
||||
85,60.61,9.0,49042.73,84.72,6.94,Normal
|
||||
86,53.69,12.74,27280.61,85.58,-0.28,Normal
|
||||
87,28.17,8.92,18686.14,82.18,9.51,Normal
|
||||
88,40.74,4.48,48028.92,57.97,4.83,Normal
|
||||
89,43.84,1.73,34738.44,74.11,1.09,Normal
|
||||
90,46.7,6.24,37932.79,69.86,1.82,Normal
|
||||
91,39.37,6.06,42641.14,80.04,3.41,Normal
|
||||
92,32.07,5.86,34143.76,69.46,1.41,Normal
|
||||
93,59.65,7.2,35282.11,77.14,4.66,Normal
|
||||
94,37.79,13.03,39913.33,69.69,3.27,Normal
|
||||
95,37.25,3.94,16584.63,88.67,9.93,Normal
|
||||
96,37.51,11.25,39612.46,105.79,8.36,Normal
|
||||
97,38.72,3.57,27355.68,77.03,2.73,Normal
|
||||
98,25.78,5.67,29827.42,91.87,7.64,Normal
|
||||
99,39.92,10.31,46839.55,66.39,9.57,Normal
|
||||
100,45.39,9.24,26702.03,66.24,0.85,Normal
|
||||
101,150.0,12.0,38000.0,78.0,6.0,Outlier_Uni_Age
|
||||
102,45.0,15.0,500000.0,82.0,8.0,Outlier_Uni_Salaire
|
||||
103,35.0,8.0,-5000.0,72.0,4.0,Outlier_Uni_Salaire_Neg
|
||||
104,38.0,11.0,42000.0,150.0,7.0,Outlier_Uni_Perf
|
||||
105,42.0,13.0,36000.0,76.0,-20.0,Outlier_Uni_Heures
|
||||
201,25.0,30.0,32000.0,70.0,5.0,Outlier_Multi_Age_Exp
|
||||
202,45.0,15.0,80000.0,40.0,2.0,Outlier_Multi_Salaire_Perf
|
||||
203,28.0,1.0,95000.0,85.0,15.0,Outlier_Multi_Exp_Salaire
|
||||
204,65.0,1.0,25000.0,60.0,0.0,Outlier_Multi_Senior_Junior
|
||||
205,35.0,10.0,40000.0,100.0,0.0,Outlier_Multi_Perf_Heures
|
||||
206,22.0,0.0,85000.0,95.0,0.0,Outlier_Multi_Impossible
|
||||
301,22.0,0.0,0.0,0.0,0.0,Cas_Limite_Zeros
|
||||
302,62.0,40.0,70000.0,88.0,3.0,Normal_Senior
|
||||
303,25.0,2.0,15000.0,65.0,2.0,Normal_Salaire_Min
|
||||
|
BIN
backend/test_data/test_outliers_complete.xlsx
Normal file
BIN
backend/test_data/test_outliers_complete.xlsx
Normal file
Binary file not shown.
80
backend/tests/test_analysis.py
Normal file
80
backend/tests/test_analysis.py
Normal file
@@ -0,0 +1,80 @@
|
||||
from fastapi.testclient import TestClient
|
||||
from main import app
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
def test_outlier_detection_univariate():
    """A single extreme value (100) must be flagged by the IQR-based method."""
    payload = {
        "data": [{"val": v} for v in (10, 11, 12, 10, 100)],
        "columns": ["val"],
        "method": "univariate",
    }

    resp = client.post("/api/v1/analysis/detect-outliers", json=payload)
    assert resp.status_code == 200

    body = resp.json()
    assert body["total_count"] == 1
    # The anomalous row is the last one (index 4) and the reason mentions IQR.
    assert body["outliers"][0]["index"] == 4
    assert "IQR bounds" in body["outliers"][0]["reasons"][0]
|
||||
|
||||
def test_outlier_detection_multivariate():
    """A point far from the tight (x, y) cluster should be caught as multivariate."""
    cluster = [{"x": 1, "y": 1}, {"x": 1.1, "y": 0.9}, {"x": 0.9, "y": 1.1}]
    payload = {
        "data": cluster + [{"x": 10, "y": 10}],  # last row is the anomaly
        "columns": ["x", "y"],
        "method": "multivariate",
    }

    resp = client.post("/api/v1/analysis/detect-outliers", json=payload)
    assert resp.status_code == 200
    assert resp.json()["total_count"] >= 1
|
||||
|
||||
def test_feature_importance():
    """Each requested feature gets an importance entry with feature/score keys."""
    # y = 2*x1 and x2 = 10*x1: both features are perfectly informative.
    rows = [{"x1": i, "x2": 10 * i, "y": 2 * i} for i in range(1, 6)]
    payload = {"data": rows, "features": ["x1", "x2"], "target": "y"}

    resp = client.post("/api/v1/analysis/feature-importance", json=payload)
    assert resp.status_code == 200

    importances = resp.json()["importances"]
    assert len(importances) == 2
    assert "feature" in importances[0]
    assert "score" in importances[0]
|
||||
|
||||
def test_run_regression():
    """Near-linear data must yield a high R² and an intercept ('const') term."""
    xs = [1, 2, 3, 4, 5]
    ys = [2.1, 3.9, 6.2, 8.1, 10.3]
    payload = {
        "data": [{"x": x, "y": y} for x, y in zip(xs, ys)],
        "x_features": ["x"],
        "y_target": "y",
        "model_type": "linear",
    }

    resp = client.post("/api/v1/analysis/run-regression", json=payload)
    assert resp.status_code == 200

    body = resp.json()
    assert "results" in body
    assert body["results"]["r_squared"] > 0.9
    assert "const" in body["results"]["coefficients"]
|
||||
37
backend/tests/test_upload.py
Normal file
37
backend/tests/test_upload.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from fastapi.testclient import TestClient
|
||||
import pandas as pd
|
||||
import io
|
||||
import pyarrow as pa
|
||||
from main import app
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
def test_health_check():
    """The liveness endpoint reports the backend as healthy."""
    resp = client.get("/health")
    assert resp.status_code == 200
    assert resp.json() == {"status": "ok", "service": "backend"}
|
||||
|
||||
def test_upload_csv():
    """Uploading a small CSV returns an Arrow IPC stream plus column metadata."""
    upload = ("test.csv", "name,age\nAlice,30\nBob,25", "text/csv")

    resp = client.post("/api/v1/upload", files={"file": upload})

    assert resp.status_code == 200
    assert resp.headers["content-type"] == "application/vnd.apache.arrow.stream"
    assert "X-Column-Metadata" in resp.headers

    # Decode the Arrow stream and check the round-tripped frame.
    with pa.ipc.open_stream(io.BytesIO(resp.content)) as reader:
        frame = reader.read_all().to_pandas()
    assert frame.shape == (2, 2)
    assert list(frame.columns) == ["name", "age"]
|
||||
|
||||
def test_upload_invalid_format():
    """Non-tabular file types are rejected with a 400 and a clear message."""
    resp = client.post(
        "/api/v1/upload",
        files={"file": ("test.txt", "invalid content", "text/plain")},
    )
    assert resp.status_code == 400
    assert "Only .xlsx, .xls and .csv files are supported" in resp.json()["detail"]
|
||||
1101
backend/uv.lock
generated
Normal file
1101
backend/uv.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user