initial commit

This commit is contained in:
2026-01-11 22:56:02 +01:00
commit 6426ddd0ab
408 changed files with 95071 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
Signature: 8a477f597d28d172789f06886806bc55
# This file is a cache directory tag created by pytest.
# For information about cache directory tags, see:
# https://bford.info/cachedir/spec.html

View File

@@ -0,0 +1,8 @@
# pytest cache directory #
This directory contains data from the pytest's cache plugin,
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
**Do not** commit this to version control.
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.

View File

@@ -0,0 +1,3 @@
{
"tests/test_upload.py::TestClient": true
}

9
backend/.pytest_cache/v/cache/nodeids vendored Normal file
View File

@@ -0,0 +1,9 @@
[
"tests/test_analysis.py::test_feature_importance",
"tests/test_analysis.py::test_outlier_detection_multivariate",
"tests/test_analysis.py::test_outlier_detection_univariate",
"tests/test_analysis.py::test_run_regression",
"tests/test_upload.py::test_health_check",
"tests/test_upload.py::test_upload_csv",
"tests/test_upload.py::test_upload_invalid_format"
]

1
backend/.python-version Normal file
View File

@@ -0,0 +1 @@
3.12

20
backend/Dockerfile Normal file
View File

@@ -0,0 +1,20 @@
# Backend image: install the project straight from pyproject.toml and serve with uvicorn.
FROM python:3.12-slim-bookworm
WORKDIR /app
# Unbuffered stdout/stderr for live container logs; skip .pyc files.
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1
# Install dependencies first so this layer is cached across source-only changes.
COPY pyproject.toml ./
RUN pip install --no-cache-dir -e .
# Copy application source
COPY . .
# FastAPI/uvicorn port
EXPOSE 8000
# Bind to all interfaces so the port mapping works from outside the container.
CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

0
backend/README.md Normal file
View File

View File

@@ -0,0 +1,147 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import List, Any, Dict, Optional
import pandas as pd
import numpy as np
from app.core.engine.clean import detect_univariate_outliers, detect_multivariate_outliers, merge_outliers, merge_outliers_structured
from app.core.engine.stats import calculate_correlation_matrix, calculate_feature_importance, run_regression_analysis
router = APIRouter(prefix="/analysis", tags=["analysis"])
class TypeValidationRequest(BaseModel):
    """Request body for /analysis/validate-type: can `data` be cast to `target_type`?"""
    data: List[Any]   # raw column values to test for convertibility
    target_type: str  # "numeric" or "date"; other values perform no conversion
class OutlierDetectionRequest(BaseModel):
    """Request body for /analysis/detect-outliers."""
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    columns: List[str]                    # columns to scan for outliers
    method: str = "both"                  # "univariate", "multivariate" or "both"
    excluded_indices: List[int] = []  # Rows to exclude from outlier detection
class CorrelationRequest(BaseModel):
    """Request body for /analysis/correlation."""
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    columns: List[str]                    # columns to correlate
    method: str = "pearson"  # pearson, spearman, kendall
    min_threshold: Optional[float] = None  # Optional minimum absolute correlation to keep a pair
    include_pvalues: bool = True  # also compute statistical significance
class FeatureImportanceRequest(BaseModel):
    """Request body for /analysis/feature-importance."""
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    features: List[str]  # predictor columns to rank
    target: str          # column the features should explain
class RegressionRequest(BaseModel):
    """Request body for /analysis/run-regression."""
    data: List[Dict[str, Optional[Any]]]  # rows as {column: value} records
    x_features: List[str]  # predictor columns
    y_target: str          # response column
    model_type: str = "linear"  # linear | polynomial | exponential | logistic
    # Feature-engineering parameters
    poly_degree: int = 1  # polynomial degree; 1 keeps the model linear
    include_interactions: bool = False  # add interaction terms between features
@router.post("/validate-type")
async def validate_type_conversion(request: TypeValidationRequest):
    """Check whether request.data can be converted to the requested type.

    Returns {"valid": True} on success, {"valid": False, "message": ...} when
    any value fails to convert.  NOTE(review): a target_type other than
    "numeric"/"date" performs no conversion and is reported valid — confirm
    this is intended.
    """
    s = pd.Series(request.data)
    try:
        if request.target_type == "numeric":
            # errors='raise' makes any non-castable value throw, caught below.
            pd.to_numeric(s, errors='raise')
        elif request.target_type == "date":
            pd.to_datetime(s, errors='raise')
        return {"status": "ok", "valid": True}
    except Exception as e:
        return {"status": "error", "valid": False, "message": str(e)}
@router.post("/detect-outliers")
async def detect_outliers(request: OutlierDetectionRequest):
    """Run univariate (IQR) and/or multivariate (Isolation Forest) outlier detection."""
    if not request.data:
        return {"outliers": []}
    df = pd.DataFrame(request.data).fillna(np.nan)
    # Rows the user already vetted are excluded from both detectors.
    uni_results = detect_univariate_outliers(
        df, request.columns, request.excluded_indices
    ) if request.method in ["univariate", "both"] else {}
    multi_results = detect_multivariate_outliers(
        df, request.columns, request.excluded_indices
    ) if request.method in ["multivariate", "both"] else {}
    # Merge into {all, univariate, multivariate} buckets for the frontend.
    structured = merge_outliers_structured(uni_results, multi_results)
    return {
        "status": "ok",
        "total_count": len(structured["all"]),
        "outliers": structured["all"],  # Backwards compatibility
        "univariate": structured["univariate"],  # Column-specific outliers
        "multivariate": structured["multivariate"]  # Global outliers
    }
@router.post("/correlation")
async def get_correlation(request: CorrelationRequest):
    """Compute a correlation matrix (plus optional p-values and a strongest/
    weakest-pair summary) for the requested columns."""
    if not request.data or not request.columns:
        return {
            "status": "error",
            "message": "Data and columns are required",
            "result": {"matrix": [], "pvalues": [], "metadata": {}}
        }
    df = pd.DataFrame(request.data).fillna(np.nan)
    # Validate method parameter before doing any work.
    valid_methods = ['pearson', 'spearman', 'kendall']
    if request.method not in valid_methods:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid method. Choose from: {', '.join(valid_methods)}"
        )
    try:
        result = calculate_correlation_matrix(
            df,
            request.columns,
            method=request.method,
            min_threshold=request.min_threshold,
            include_pvalues=request.include_pvalues
        )
        # Summary adds a strongest/weakest digest on top of the raw matrix.
        from app.core.engine.stats import get_correlation_summary
        summary = get_correlation_summary(result)
        return {
            "status": "ok",
            "result": result,
            "summary": summary
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Correlation calculation failed: {str(e)}")
@router.post("/feature-importance")
async def get_feature_importance(request: FeatureImportanceRequest):
    """Rank request.features by permutation importance against request.target."""
    # Empty inputs yield an empty ranking rather than an error.
    if not request.data or not request.features or not request.target: return {"importances": []}
    df = pd.DataFrame(request.data).fillna(np.nan)
    return {"status": "ok", "importances": calculate_feature_importance(df, request.features, request.target)}
@router.post("/run-regression")
async def run_regression(request: RegressionRequest):
    """Fit the requested regression model and return its results.

    400 for user errors (missing params, data invalid for the chosen model),
    500 for unexpected failures inside the engine.
    """
    if not request.data or not request.x_features or not request.y_target:
        raise HTTPException(status_code=400, detail="Incomplete parameters.")
    df = pd.DataFrame(request.data).fillna(np.nan)
    try:
        results = run_regression_analysis(
            df,
            request.x_features,
            request.y_target,
            request.model_type,
            request.poly_degree,
            request.include_interactions
        )
        return {"status": "ok", "results": results}
    except ValueError as e:
        # ValueError from the engine signals bad user input — client error.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal Analysis Error: {str(e)}")

View File

@@ -0,0 +1,33 @@
from fastapi import APIRouter, HTTPException, Response
from pydantic import BaseModel
from typing import Dict, Any, List
from app.core.engine.reports import create_pdf_report
router = APIRouter(prefix="/reports", tags=["reporting"])
class ExportRequest(BaseModel):
    """Request body for /reports/export."""
    project_name: str  # used for the report title and the download filename
    results: Dict[str, Any]  # regression results (coefficients, plots, ...)
    audit_trail: Dict[str, Any]  # reproducibility info (excluded rows, ...)
@router.post("/export")
async def export_report(request: ExportRequest):
    """
    Generates and returns a PDF report as a file download.

    NOTE(review): project_name is interpolated into Content-Disposition
    unsanitized — names containing quotes or newlines could break the header;
    confirm it is validated upstream.
    """
    try:
        pdf_bytes = create_pdf_report(
            request.project_name,
            request.results,
            request.audit_trail
        )
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": f"attachment; filename=Report_{request.project_name}.pdf"
            }
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,44 @@
from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse
import io
import json
from app.core.engine.ingest import parse_file, get_column_metadata, dataframe_to_arrow_stream
router = APIRouter(prefix="/upload", tags=["ingestion"])
@router.post("")
async def upload_file(file: UploadFile = File(...)):
    """
    Upload an Excel/CSV file and receive an Apache Arrow stream back.

    Column metadata is returned in the X-Column-Metadata response header as a
    JSON string (exposed to browser JS via Access-Control-Expose-Headers).

    Raises:
        HTTPException 400: missing/unsupported filename or any parsing failure.
    """
    # 1. Validation — file.filename may be None (multipart part without a
    # name); guard before calling endswith so we return 400 instead of 500.
    if not file.filename or not file.filename.endswith(('.xlsx', '.xls', '.csv')):
        raise HTTPException(status_code=400, detail="Only .xlsx, .xls and .csv files are supported.")
    try:
        content = await file.read()
        # 2. Parsing (Excel or CSV -> DataFrame)
        df = parse_file(content, file.filename)
        # 3. Metadata extraction (name + inferred type per column)
        metadata = get_column_metadata(df)
        # 4. Conversion to the Arrow IPC stream format
        arrow_bytes = dataframe_to_arrow_stream(df)
        # Binary Arrow payload in the body; metadata rides in a custom header.
        return StreamingResponse(
            io.BytesIO(arrow_bytes),
            media_type="application/vnd.apache.arrow.stream",
            headers={
                "X-Column-Metadata": json.dumps(metadata),
                "Access-Control-Expose-Headers": "X-Column-Metadata"
            }
        )
    except Exception as e:
        # TODO: route through a real logger; str(e) is surfaced to the client.
        raise HTTPException(status_code=400, detail=f"Error processing file: {str(e)}")

View File

@@ -0,0 +1,165 @@
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from typing import List, Dict, Any
def detect_univariate_outliers(df: pd.DataFrame, columns: List[str], excluded_indices: List[int] = None) -> Dict[int, List[str]]:
    """
    Flag per-column outliers using the classic 1.5 * IQR fence rule.

    Args:
        df: Input DataFrame.
        columns: Column names to analyze (unknown names are skipped).
        excluded_indices: Row indices to drop before computing the fences.

    Returns:
        Dictionary of {original_row_index: [human-readable reasons]}.
    """
    # Work on the subset with user-vetted rows removed.
    if excluded_indices:
        df = df.loc[~df.index.isin(excluded_indices)]
    flagged: Dict[int, List[str]] = {}
    for column in columns:
        if column not in df.columns:
            continue
        # Coerce to numeric; non-numeric entries become NaN and never match.
        values = pd.to_numeric(df[column], errors='coerce')
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        low = q1 - 1.5 * spread
        high = q3 + 1.5 * spread
        mask = (values < low) | (values > high)
        for row_idx in df.index[mask]:
            original_value = df.at[row_idx, column]
            message = f"Column '{column}' value {original_value} is outside IQR bounds [{low:.2f}, {high:.2f}]"
            flagged.setdefault(row_idx, []).append(message)
    return flagged
def detect_multivariate_outliers(df: pd.DataFrame, columns: List[str], excluded_indices: List[int] = None) -> Dict[int, List[str]]:
    """
    Detect joint anomalies across several numeric columns with Isolation Forest.

    Args:
        df: Input DataFrame.
        columns: Column names to analyze together.
        excluded_indices: Row indices to drop before fitting (user-vetted rows).

    Returns:
        Dictionary of {original_row_index: [reasons]}.  Rows with NaN in any
        selected column are dropped before fitting and thus never flagged.
    """
    # (Fix: removed a dead `original_indices` local that was never used.)
    # Exclude user-vetted rows if provided.
    if excluded_indices:
        df = df[~df.index.isin(excluded_indices)]
    # Coerce the selected columns to numeric; non-numeric values become NaN.
    numeric_df = df[columns].apply(pd.to_numeric, errors='coerce')
    if numeric_df.empty:
        return {}
    # Keep only rows that are fully observed in the selected columns.
    valid_mask = numeric_df.notna().all(axis=1)
    numeric_df_clean = numeric_df[valid_mask]
    if numeric_df_clean.empty:
        return {}
    # Fixed seed keeps results reproducible across requests.
    model = IsolationForest(contamination='auto', random_state=42)
    preds = model.fit_predict(numeric_df_clean)
    # IsolationForest returns -1 for outliers; the clean frame's index still
    # carries the original row labels.
    outlier_indices = numeric_df_clean.index[preds == -1].tolist()
    return {int(idx): ["Multivariate anomaly detected by Isolation Forest"] for idx in outlier_indices}
def merge_outliers(uni: Dict[int, List[str]], multi: Dict[int, List[str]]) -> List[Dict[str, Any]]:
    """
    Merges results into a flat list of outlier objects.
    DEPRECATED: Use merge_outliers_structured instead for better type separation.
    """
    combined = sorted(set(uni) | set(multi))
    return [
        {"index": int(idx), "reasons": uni.get(idx, []) + multi.get(idx, [])}
        for idx in combined
    ]
def merge_outliers_structured(uni: Dict[int, List[str]], multi: Dict[int, List[str]]) -> Dict[str, Any]:
    """
    Group outliers by kind so the frontend can render them separately.

    Returns:
        Dictionary with:
        - 'univariate': {column_name: [{"index", "reasons"}, ...]} — parsed
          back out of the reason strings emitted by detect_univariate_outliers
        - 'multivariate': [{"index", "reasons"}, ...] global anomalies
        - 'all': flat merged list (legacy shape kept for older clients)
    """
    per_column: Dict[str, List[Dict[str, Any]]] = {}
    for idx, reasons in uni.items():
        for reason in reasons:
            # Reasons are formatted "Column '<name>' value ..." — recover the
            # column name from the message text.
            if "Column '" not in reason:
                continue
            start = reason.index("Column '") + 8
            end = reason.index("'", start)
            column = reason[start:end]
            bucket = per_column.setdefault(column, [])
            entry = next((e for e in bucket if e["index"] == idx), None)
            if entry is None:
                bucket.append({"index": int(idx), "reasons": [reason]})
            else:
                entry["reasons"].append(reason)
    # Multivariate outliers pass through unchanged, one entry per row.
    global_outliers = [
        {"index": int(idx), "reasons": reasons}
        for idx, reasons in multi.items()
    ]
    # Legacy flat format: union of both sources, reasons concatenated.
    legacy = [
        {"index": int(idx), "reasons": uni.get(idx, []) + multi.get(idx, [])}
        for idx in sorted(set(uni) | set(multi))
    ]
    return {
        "univariate": per_column,
        "multivariate": global_outliers,
        "all": legacy
    }

View File

@@ -0,0 +1,56 @@
import pandas as pd
import pyarrow as pa
import io
from typing import Tuple, Dict, Any
def parse_file(file_content: bytes, filename: str) -> pd.DataFrame:
    """
    Parse an uploaded Excel (.xlsx/.xls) or CSV file into a Pandas DataFrame.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original filename; the extension selects the parser.

    Returns:
        Parsed DataFrame with whitespace-stripped string column names.

    Raises:
        ValueError: If the extension is not .xlsx, .xls or .csv.
    """
    file_obj = io.BytesIO(file_content)
    if filename.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(file_obj)
    elif filename.endswith('.csv'):
        # Comma is assumed; switch to sep=None/engine='python' if other
        # delimiters show up in the wild.
        df = pd.read_csv(file_obj)
    else:
        # Bug fix: the message previously contained no placeholder, so the
        # offending filename was never reported to the caller.
        raise ValueError(f"Unsupported file format: {filename}")
    # Basic hygiene: strip whitespace from headers
    df.columns = [str(c).strip() for c in df.columns]
    return df
def get_column_metadata(df: pd.DataFrame) -> list:
    """
    Describe each column as {"name", "type", "native_type"} for the frontend.

    The simplified "type" is derived from the pandas dtype string:
    categorical (object/string), date (datetime), boolean (bool), or the
    default numeric.
    """
    def _simplify(dtype_name: str) -> str:
        # Order matters: object/string wins over everything else.
        if "object" in dtype_name or "string" in dtype_name:
            return "categorical"
        if "datetime" in dtype_name:
            return "date"
        if "bool" in dtype_name:
            return "boolean"
        return "numeric"

    return [
        {
            "name": col,
            "type": _simplify(str(df[col].dtype)),
            "native_type": str(df[col].dtype),
        }
        for col in df.columns
    ]
def dataframe_to_arrow_stream(df: pd.DataFrame) -> bytes:
    """
    Serialize a Pandas DataFrame as an Apache Arrow IPC *stream* (bytes),
    as opposed to the random-access Arrow file format.
    """
    table = pa.Table.from_pandas(df)
    sink = pa.BufferOutputStream()
    # The writer emits the schema followed by the table's record batches.
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue().to_pybytes()

View File

@@ -0,0 +1,223 @@
from fpdf import FPDF
from datetime import datetime
from io import BytesIO
import pandas as pd
import platform
import sklearn
import statsmodels
import os
import matplotlib
matplotlib.use('Agg') # Use non-GUI backend
import matplotlib.pyplot as plt
import tempfile
class AnalysisReport(FPDF):
    """PDF report with a branded header and a timestamped footer on every page."""

    def header(self):
        # Called automatically by FPDF at the top of each page.
        try:
            self.set_font('Arial', 'B', 15)
            self.set_text_color(79, 70, 229)  # Indigo 600
            self.cell(0, 10, 'Data_analysis - Rapport de Validation', 0, 1, 'L')
            self.set_draw_color(226, 232, 240)  # light separator line
            self.line(10, 22, 200, 22)
            self.ln(10)
        except Exception as e:
            # Never let a drawing error abort the whole PDF generation.
            print(f"Header error: {e}")

    def footer(self):
        # Called automatically by FPDF at the bottom of each page.
        try:
            self.set_y(-15)
            self.set_font('Arial', 'I', 8)
            self.set_text_color(148, 163, 184)
            self.cell(0, 10, f'Page {self.page_no()} | Genere le {datetime.now().strftime("%Y-%m-%d %H:%M")}', 0, 0, 'C')
        except Exception as e:
            print(f"Footer error: {e}")
def create_pdf_report(project_name: str, results: dict, audit_trail: dict) -> bytes:
    """
    Build the full analysis PDF and return it as raw bytes.

    Args:
        project_name: Shown in the summary block.
        results: Regression results; keys read here: model_type, r_squared,
            sample_size, coefficients, p_values, fit_plot, diagnostic_plot.
            All reads are defensive (missing or None values are tolerated).
        audit_trail: Reproducibility info; reads excluded_rows_count.

    Returns:
        PDF bytes.  On any failure a minimal one-page error PDF is returned
        instead of raising, so the caller always has something to send back.
    """
    try:
        pdf = AnalysisReport()
        pdf.add_page()
        # 1. Summary (project, model, R², sample size)
        pdf.set_font('Arial', 'B', 12)
        pdf.set_text_color(51, 65, 85)
        pdf.cell(0, 10, f"Projet : {project_name}", 0, 1)
        pdf.ln(5)
        # Handle missing model_type (may also arrive as a list)
        model_type = results.get('model_type', 'Regression')
        if isinstance(model_type, list):
            model_type = model_type[0] if model_type else 'Regression'
        pdf.set_font('Arial', '', 10)
        pdf.cell(0, 8, f"Modele : {model_type}", 0, 1)
        # Handle r_squared safely
        r_squared = results.get('r_squared', 0)
        if r_squared is None:
            r_squared = 0
        pdf.cell(0, 8, f"Precision (R²) : {float(r_squared):.4f}", 0, 1)
        # Handle sample_size safely
        sample_size = results.get('sample_size', 0)
        if sample_size is None:
            sample_size = 0
        pdf.cell(0, 8, f"Taille de l'echantillon : {int(sample_size)}", 0, 1)
        pdf.ln(10)
        # 2. Coefficients table (p < 0.05 rendered green, otherwise red)
        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 10, "Coefficients du Modele", 0, 1)
        pdf.set_font('Arial', 'B', 9)
        pdf.set_fill_color(248, 250, 252)
        pdf.cell(80, 8, "Feature", 1, 0, 'L', True)
        pdf.cell(50, 8, "Coefficient", 1, 0, 'R', True)
        pdf.cell(50, 8, "P-Value", 1, 1, 'R', True)
        # Get coefficients and p_values safely
        coefficients = results.get('coefficients', {})
        p_values = results.get('p_values', {})
        if coefficients:
            pdf.set_font('Arial', '', 9)
            for name, coef in coefficients.items():
                # Convert coef to float safely
                try:
                    coef_val = float(coef)
                except (TypeError, ValueError):
                    coef_val = 0.0
                # Get p-value safely (1.0 default == not significant)
                p_val = p_values.get(name, 1.0)
                try:
                    p_val = float(p_val)
                except (TypeError, ValueError):
                    p_val = 1.0
                pdf.cell(80, 8, str(name), 1)
                pdf.cell(50, 8, f"{coef_val:.4f}", 1, 0, 'R')
                if p_val < 0.05:
                    pdf.set_text_color(16, 185, 129)  # Emerald
                else:
                    pdf.set_text_color(244, 63, 94)  # Rose
                pdf.cell(50, 8, f"{p_val:.4f}", 1, 1, 'R')
                pdf.set_text_color(51, 65, 85)
        else:
            pdf.set_font('Arial', '', 9)
            pdf.cell(0, 8, "Aucun coefficient disponible", 0, 1)
        pdf.ln(15)
        # 3. Visualization charts (matplotlib figures embedded as PNG files)
        if 'fit_plot' in results and len(results['fit_plot']) > 0:
            pdf.set_font('Arial', 'B', 11)
            pdf.cell(0, 10, "Courbe de Regression", 0, 1)
            pdf.ln(5)
            # Scatter of observed values plus the fitted curve
            fit_data = results['fit_plot']
            x_vals = [p['x'] for p in fit_data]
            y_real = [p['real'] for p in fit_data]
            y_pred = [p['pred'] for p in fit_data]
            plt.figure(figsize=(10, 6))
            plt.scatter(x_vals, y_real, alpha=0.6, color='#4f46e5', label='Données réelles', s=50)
            plt.plot(x_vals, y_pred, color='#ef4444', linewidth=2, label='Courbe de régression')
            plt.xlabel('Valeur X', fontsize=12)
            plt.ylabel('Valeur Y', fontsize=12)
            plt.title('Ajustement du Modèle', fontsize=14, fontweight='bold')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            # Save plot to a temp file, embed it in the PDF, then delete it
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                plt.savefig(tmp.name, dpi=150, bbox_inches='tight')
                plt.close()
                pdf.image(tmp.name, x=10, w=190)
                os.unlink(tmp.name)
            pdf.ln(10)
        # Residuals-vs-fitted diagnostic plot
        if 'diagnostic_plot' in results and len(results['diagnostic_plot']) > 0:
            pdf.set_font('Arial', 'B', 11)
            pdf.cell(0, 10, "Graphique des Residus", 0, 1)
            pdf.ln(5)
            residuals_data = results['diagnostic_plot']
            fitted = [p['fitted'] for p in residuals_data]
            residuals = [p['residual'] for p in residuals_data]
            plt.figure(figsize=(10, 6))
            plt.scatter(fitted, residuals, alpha=0.6, color='#4f46e5', s=50)
            plt.axhline(y=0, color='#ef4444', linestyle='--', linewidth=2)
            plt.xlabel('Valeurs Ajustees', fontsize=12)
            plt.ylabel('Residus', fontsize=12)
            plt.title('Graphique des Residus', fontsize=14, fontweight='bold')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                plt.savefig(tmp.name, dpi=150, bbox_inches='tight')
                plt.close()
                pdf.image(tmp.name, x=10, w=190)
                os.unlink(tmp.name)
            pdf.ln(10)
        # 4. Audit trail (reproducibility)
        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 10, "Piste d'Audit & Reproductibilite", 0, 1)
        pdf.set_font('Arial', '', 8)
        pdf.set_text_color(100, 116, 139)
        # Cleaning steps
        excluded_count = audit_trail.get('excluded_rows_count', 0)
        if excluded_count is None:
            excluded_count = 0
        pdf.multi_cell(0, 6, f"- Nettoyage : {int(excluded_count)} lignes exclues de l'analyse.")
        # Environment versions (best effort; a library may lack __version__)
        pdf.ln(5)
        pdf.set_font('Arial', 'B', 8)
        pdf.cell(0, 6, "Environnement Technique :", 0, 1)
        pdf.set_font('Arial', '', 8)
        pdf.cell(0, 5, f"- Python : {platform.python_version()}", 0, 1)
        pdf.cell(0, 5, f"- Pandas : {pd.__version__}", 0, 1)
        # Try to get sklearn version safely
        try:
            pdf.cell(0, 5, f"- Scikit-learn : {sklearn.__version__}", 0, 1)
        except Exception:
            pdf.cell(0, 5, "- Scikit-learn : Installé", 0, 1)
        # Try to get statsmodels version safely
        try:
            pdf.cell(0, 5, f"- Statsmodels : {statsmodels.__version__}", 0, 1)
        except Exception:
            pdf.cell(0, 5, "- Statsmodels : Installé", 0, 1)
        pdf.cell(0, 5, f"- Random Seed : 42 (Fixed)", 0, 1)
        # Serialize the PDF into memory
        pdf_buffer = BytesIO()
        pdf.output(pdf_buffer)
        return pdf_buffer.getvalue()
    except Exception as e:
        # Fallback: ship a one-page PDF describing the failure rather than raise.
        error_pdf = FPDF()
        error_pdf.add_page()
        error_pdf.set_font('Arial', 'B', 16)
        error_pdf.cell(0, 10, f"Erreur lors de la generation du PDF", 0, 1)
        error_pdf.ln(10)
        error_pdf.set_font('Arial', '', 12)
        error_pdf.multi_cell(0, 10, f"Erreur: {str(e)}")
        error_buffer = BytesIO()
        error_pdf.output(error_buffer)
        return error_buffer.getvalue()

View File

@@ -0,0 +1,430 @@
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats
from typing import List, Dict, Any, Tuple
import sympy as sp
def calculate_correlation_matrix(
    df: pd.DataFrame,
    columns: List[str],
    method: str = 'pearson',
    min_threshold: float = None,
    include_pvalues: bool = True
) -> Dict[str, Any]:
    """
    Calculate a correlation matrix with optional p-values and filtering.

    Args:
        df: Input DataFrame.
        columns: Column names to analyze.
        method: Correlation method ('pearson', 'spearman', 'kendall').
        min_threshold: Minimum absolute correlation value to include
            (optional).  Filtered pairs are dropped from both "matrix" and
            "pvalues".
        include_pvalues: Whether to calculate statistical significance.

    Returns:
        {"matrix": [...], "pvalues": [...], "metadata": {...}}.
    """
    if not columns:
        return {"matrix": [], "pvalues": [], "metadata": {}}
    # Convert to numeric; anything non-numeric becomes NaN.
    numeric_df = df[columns].apply(pd.to_numeric, errors='coerce')
    # Remove columns with too many missing values (>50%).
    missing_ratios = numeric_df.isnull().sum() / len(numeric_df)
    valid_cols = missing_ratios[missing_ratios <= 0.5].index.tolist()
    if len(valid_cols) < 2:
        return {"matrix": [], "pvalues": [], "metadata": {"error": "Need at least 2 valid numeric columns"}}
    # Pairwise deletion (per-pair dropna below) is more robust than listwise.
    clean_df = numeric_df[valid_cols]
    corr_matrix = clean_df.corr(method=method)
    # Compute p-values pairwise if requested.  The diagonal stays at 0.0 and
    # is reported as not significant (self-correlation has no meaningful p).
    pvalue_matrix = None
    if include_pvalues:
        pvalue_matrix = pd.DataFrame(np.zeros_like(corr_matrix),
                                     index=corr_matrix.index,
                                     columns=corr_matrix.columns)
        test_funcs = {
            'pearson': stats.pearsonr,
            'spearman': stats.spearmanr,
            'kendall': stats.kendalltau,
        }
        for i, col1 in enumerate(corr_matrix.columns):
            for j, col2 in enumerate(corr_matrix.index):
                if i != j:
                    # Pairwise complete observations; need >= 3 points.
                    valid_data = clean_df[[col1, col2]].dropna()
                    if len(valid_data) >= 3:
                        test = test_funcs.get(method)
                        if test is not None:
                            _, pval = test(valid_data.iloc[:, 0], valid_data.iloc[:, 1])
                        else:
                            pval = np.nan
                        pvalue_matrix.iloc[i, j] = pval
    # Build flat results (one entry per ordered pair, diagonal included).
    results = []
    pvalue_results = []
    for x in corr_matrix.columns:
        for y in corr_matrix.index:
            value = float(corr_matrix.at[y, x])
            # Threshold filter drops the pair entirely (matrix AND pvalues).
            if min_threshold is not None and abs(value) < min_threshold:
                continue
            results.append({
                "x": x,
                "y": y,
                "value": value,
                "abs_value": abs(value)
            })
            if include_pvalues and pvalue_matrix is not None:
                pval = pvalue_matrix.at[y, x]
                if pd.isna(pval):
                    pvalue_results.append({"x": x, "y": y, "pvalue": None, "significant": False})
                else:
                    # Bug fix: the old check `(pval or 1) < 0.05` treated an
                    # exact p-value of 0.0 (perfect correlation) as NOT
                    # significant.  Compare directly; the diagonal is excluded
                    # explicitly to preserve its previous "not significant".
                    pvalue_results.append({
                        "x": x,
                        "y": y,
                        "pvalue": float(pval),
                        "significant": bool(x != y and float(pval) < 0.05)
                    })
    return {
        "matrix": results,
        "pvalues": pvalue_results if include_pvalues else [],
        "metadata": {
            "method": method,
            "n_observations": len(clean_df),
            "n_variables": len(valid_cols),
            "columns_analyzed": valid_cols,
            "threshold_applied": min_threshold
        }
    }
def get_correlation_summary(correlation_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Summarize a correlation result: the top-5 strongest and bottom-5 weakest
    (non-zero) off-diagonal pairs, plus the total number of pairs.
    """
    pairs = [
        entry for entry in correlation_data.get("matrix", [])
        if entry["x"] != entry["y"]  # drop self-correlations
    ]
    if not pairs:
        return {"strongest": [], "weakest": []}
    # Rank once by |r| descending; the tail gives the weakest candidates.
    ranked = sorted(pairs, key=lambda e: e["abs_value"], reverse=True)
    nonzero_tail = [e for e in ranked if e["abs_value"] > 0][-5:]
    return {
        "strongest": ranked[:5],
        "weakest": sorted(nonzero_tail, key=lambda e: e["abs_value"]),
        "total_pairs": len(pairs)
    }
def calculate_feature_importance(df: pd.DataFrame, features: List[str], target: str) -> List[Dict[str, Any]]:
    """
    Score features by permutation importance of a RandomForestRegressor.

    Scores are clipped at zero, normalized to sum to 1 (when any are
    positive), and returned sorted descending.
    """
    if not features or not target:
        return []
    usable = df.dropna(subset=[target])
    X = usable[features].apply(pd.to_numeric, errors='coerce').fillna(0)
    y = usable[target]
    # Encode a categorical target as integer codes so the regressor can fit it.
    if y.dtype == 'object' or y.dtype == 'string':
        y = pd.factorize(y)[0]
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X, y)
    perm = permutation_importance(forest, X, y, n_repeats=10, random_state=42, n_jobs=-1)
    scores = [
        {"feature": name, "score": max(0, float(raw))}
        for name, raw in zip(features, perm.importances_mean)
    ]
    total = sum(item["score"] for item in scores)
    if total > 0:
        for item in scores:
            item["score"] /= total
    return sorted(scores, key=lambda item: item["score"], reverse=True)
def generate_equations(coefficients: Dict[str, float], model_type: str) -> Dict[str, str]:
    """
    Generate equation strings in LaTeX, Python, and Excel formats.

    Args:
        coefficients: Dictionary of feature names to coefficient values.  An
            entry named 'const', 'intercept' or '(Intercept)' is treated as
            the intercept term.
        model_type: Type of regression model ('linear', 'polynomial',
            'exponential', 'logistic').  NOTE(review): currently unused — the
            same additive form is emitted for every model type; confirm.

    Returns:
        Dictionary with 'latex', 'python', and 'excel' equation strings.
    """
    from sympy import symbols, sympify, latex, Float, preorder_traversal, Mul, Pow
    # Split the intercept out from the per-feature coefficients.
    intercept = 0.0
    feature_coefs = {}
    for key, value in coefficients.items():
        if key in ['const', 'intercept', '(Intercept)']:
            intercept = float(value)
        else:
            feature_coefs[key] = float(value)

    # Helper to format a number compactly for the Python/Excel strings.
    def format_number(num: float) -> str:
        """Format number with 3 decimal places max."""
        if num == 0:
            return "0"
        abs_num = abs(num)
        # Scientific notation for very small or very large magnitudes.
        if abs_num >= 10000 or (abs_num < 0.001 and abs_num > 0):
            return f"{num:.2e}"
        # Regular decimal with 3 decimal places max
        formatted = f"{num:.3f}"
        # Remove trailing zeros (and a dangling decimal point)
        return formatted.rstrip('0').rstrip('.')

    # Build the LaTeX form with sympy; symbol names are sanitized so sympify
    # can parse them (spaces and '^' are not valid in sympy identifiers).
    for name in feature_coefs.keys():
        safe_name = name.replace(' ', '_').replace('^', '_pow_')
        symbols(safe_name)
    # Assemble "intercept + c1*x1 + c2*x2 ..." as a parseable string.
    expr_parts = []
    intercept_str = f"{intercept:.10f}"
    expr_parts.append(intercept_str)
    for name, coef in feature_coefs.items():
        safe_name = name.replace(' ', '_').replace('^', '_pow_')
        coef_str = f"{coef:.10f}"
        expr_parts.append(f"{coef_str}*{safe_name}")
    expr_str = " + ".join(expr_parts)
    expr = sympify(expr_str)

    def scientific_round_expr(e, ndigits=2):
        """
        Convert floats to scientific notation with specified decimal places.
        Example: 12345.678 -> 1.23 × 10^4
        """
        repl = {}
        for node in preorder_traversal(e):
            if isinstance(node, Float):
                val = float(node.evalf(6))  # get enough precision first
                abs_val = abs(val)
                # Scientific notation for large or small numbers only
                if abs_val >= 10000 or (abs_val < 0.01 and abs_val > 0):
                    sci_str = f"{val:.{ndigits}e}"
                    mantissa, exponent = sci_str.split('e')
                    # Reconstruct as mantissa × 10^exponent (kept unevaluated)
                    repl[node] = Mul(Float(mantissa), Pow(10, int(exponent)), evaluate=False)
                else:
                    # Regular rounding for normal-magnitude numbers
                    repl[node] = Float(round(val, ndigits))
        return e.xreplace(repl)

    # Apply scientific rounding, then render to LaTeX.
    expr_sci = scientific_round_expr(expr, 2)
    latex_eq_raw = latex(expr_sci, fold_frac_powers=True, fold_short_frac=True, mul_symbol='times')
    # Swap the sanitized symbol names back to readable display names.
    for name in feature_coefs.keys():
        safe_name = name.replace(' ', '_').replace('^', '_pow_')
        display_name = name.replace('_', ' ')
        latex_eq_raw = latex_eq_raw.replace(safe_name, f"\\mathrm{{{display_name}}}")
    # Add "y = " prefix
    latex_eq = f"y = {latex_eq_raw}"

    # Python form: signed terms with compactly-formatted coefficients.
    python_parts = []
    for name, coef in feature_coefs.items():
        coef_str = format_number(coef)
        if coef >= 0:
            python_parts.append(f"+ {coef_str}*{name}")
        else:
            python_parts.append(f"- {format_number(abs(coef))}*{name}")
    intercept_str_clean = format_number(intercept)
    python_eq = f"y = {intercept_str_clean} " + ' '.join(python_parts) if python_parts else f"y = {intercept_str_clean}"

    # Excel form: each feature mapped to a spreadsheet column (A1, B1, ...).
    col_letters = {name: chr(65 + i) for i, name in enumerate(feature_coefs.keys())}
    excel_parts = []
    for name, coef in feature_coefs.items():
        coef_str = format_number(coef)
        col_letter = col_letters[name]
        if coef >= 0:
            excel_parts.append(f"+ {coef_str}*{col_letter}1")
        else:
            excel_parts.append(f"- {format_number(abs(coef))}*{col_letter}1")
    excel_eq = f"={intercept_str_clean} " + ' '.join(excel_parts) if excel_parts else f"={intercept_str_clean}"
    return {
        "latex": latex_eq,
        "python": python_eq,
        "excel": excel_eq
    }
def run_regression_analysis(df: pd.DataFrame, x_cols: List[str], y_col: str, model_type: str = "linear", poly_degree: int = 1, include_interactions: bool = False) -> Dict[str, Any]:
    """Fit a regression model on *df* and build plot-ready diagnostics.

    Args:
        df: Input data; the selected columns are coerced to numeric and rows
            with NaNs (after coercion) are dropped.
        x_cols: Names of the predictor columns.
        y_col: Name of the target column.
        model_type: One of "linear", "polynomial", "logistic", "exponential".
            Any other value falls through to plain OLS.
        poly_degree: Polynomial degree, used only when model_type == "polynomial".
        include_interactions: For non-polynomial models, add a degree-2
            interaction-only feature expansion.

    Returns:
        Dict with fit statistics (r_squared, adj_r_squared, aic, bic,
        coefficients, p_values, std_errors, sample_size), residuals,
        per-feature fit plots, partial-regression plots, a residuals-vs-fitted
        diagnostic plot, and equation strings (LaTeX / Python / Excel).

    Raises:
        ValueError: If too few valid rows remain after dropping NaNs, or if
            model fitting fails (any underlying exception is re-wrapped).
    """
    # 1. Prep Data
    # Capture original X for plotting before transformation
    X_original = df[x_cols].apply(pd.to_numeric, errors='coerce')
    y_data = df[y_col]
    # Align indices after dropna
    data = pd.concat([X_original, y_data], axis=1).dropna()
    # Need more observations than predictors for the fit to be identifiable.
    if data.empty or len(data) < len(x_cols) + 1:
        raise ValueError("Insufficient data.")
    X_raw = data[x_cols]  # Keep untransformed X for plotting
    y = pd.to_numeric(data[y_col], errors='coerce')
    X = X_raw.copy()  # Start with raw features for modelling
    # 2. Advanced Feature Engineering
    if model_type == "polynomial" or include_interactions:
        degree = poly_degree if model_type == "polynomial" else 2
        interaction_only = include_interactions and model_type != "polynomial"
        poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)
        X_poly = poly.fit_transform(X)
        poly_cols = poly.get_feature_names_out(X.columns)
        X = pd.DataFrame(X_poly, columns=poly_cols, index=X.index)
    # 3. Model Fitting
    try:
        model = None
        y_pred = None
        if model_type == "logistic":
            X_const = sm.add_constant(X)
            # Binarize the target around its median so Logit always has a
            # two-class outcome, regardless of the raw target's scale.
            y_bin = (y > y.median()).astype(int)
            model = sm.Logit(y_bin, X_const).fit(disp=0)
            y_pred = model.predict(X_const)
            y = y_bin
        elif model_type == "exponential":
            if (y <= 0).any(): raise ValueError("Exponential regression requires Y > 0.")
            # Fit OLS on log(y); predictions are transformed back with exp().
            # NOTE(review): the reported r_squared below is that of the
            # log-space fit, not of the back-transformed predictions.
            y_log = np.log(y)
            X_const = sm.add_constant(X)
            lin_model = sm.OLS(y_log, X_const).fit()
            y_pred = np.exp(lin_model.predict(X_const))
            model = lin_model
        else:  # Linear or Polynomial
            X_const = sm.add_constant(X)
            model = sm.OLS(y, X_const).fit()
            y_pred = model.predict(X_const)
        # 4. Construct Visualization Data
        # Create fit plots for each original feature
        fit_plots_by_feature = {}
        residuals_vs_fitted = []
        y_list = y.tolist()
        pred_list = y_pred.tolist()
        residuals = []
        # Create a fit plot for each original (untransformed) feature
        for feature_name in X_raw.columns:
            x_feature_list = X_raw[feature_name].tolist()
            feature_plot = []
            for i in range(len(y_list)):
                feature_plot.append({
                    "x": float(x_feature_list[i]),
                    "real": float(y_list[i]),
                    "pred": float(pred_list[i])
                })
            # Sort by X for proper curve rendering on the frontend
            feature_plot.sort(key=lambda item: item["x"])
            fit_plots_by_feature[feature_name] = feature_plot
        # Also create a single fit_plot using the first feature for backward compatibility
        fit_plot = fit_plots_by_feature[X_raw.columns[0]] if len(X_raw.columns) > 0 else []
        # Residuals plot (residual = observed - predicted)
        for i in range(len(y_list)):
            res_val = y_list[i] - pred_list[i]
            residuals.append(res_val)
            residuals_vs_fitted.append({
                "fitted": float(pred_list[i]),
                "residual": res_val
            })
        # 5. Calculate Partial Regression Plots (Added Variable Plots)
        # These show the isolated effect of each variable controlling for others
        partial_regression_plots = {}
        # Only calculate for multiple regression (more than 1 feature)
        if len(X_raw.columns) > 1:
            for feature_name in X_raw.columns:
                # Get other features (all except current)
                other_features = [col for col in X_raw.columns if col != feature_name]
                if len(other_features) == 0:
                    continue
                # Step 1: Regress Y on all features except current one
                X_other = X_raw[other_features]
                X_other_const = sm.add_constant(X_other)
                model_y = sm.OLS(y, X_other_const).fit()
                y_residuals = y - model_y.predict(X_other_const)
                # Step 2: Regress current feature on other features
                model_x = sm.OLS(X_raw[feature_name], X_other_const).fit()
                x_residuals = X_raw[feature_name] - model_x.predict(X_other_const)
                # Step 3: Create partial plot data (residual vs residual)
                partial_plot = []
                for i in range(len(y)):
                    partial_plot.append({
                        "x": float(x_residuals.iloc[i]),
                        "y": float(y_residuals.iloc[i])
                    })
                # Sort by x for proper line rendering
                partial_plot.sort(key=lambda item: item["x"])
                partial_regression_plots[feature_name] = partial_plot
        # Generate equation strings (LaTeX / Python / Excel)
        equations = generate_equations(model.params.to_dict(), model_type)
        # Logit models expose prsquared (pseudo R²) instead of rsquared,
        # hence the hasattr fallbacks below.
        summary = {
            "r_squared": float(model.rsquared) if hasattr(model, 'rsquared') else float(model.prsquared),
            "adj_r_squared": float(model.rsquared_adj) if hasattr(model, 'rsquared_adj') else None,
            "aic": float(model.aic),
            "bic": float(model.bic),
            "coefficients": model.params.to_dict(),
            "p_values": model.pvalues.to_dict(),
            "std_errors": model.bse.to_dict(),
            "sample_size": int(model.nobs),
            "residuals": residuals,
            "fit_plot": fit_plot,  # Backward compatibility (first feature)
            "fit_plots_by_feature": fit_plots_by_feature,  # All features
            "partial_regression_plots": partial_regression_plots,  # Partial plots for multivariate
            "diagnostic_plot": residuals_vs_fitted,
            "equations": equations  # LaTeX, Python, Excel formats
        }
        return summary
    except Exception as e:
        raise ValueError(f"Model calculation failed: {str(e)}")

View File

@@ -0,0 +1,294 @@
#!/usr/bin/env python3
"""
Générateur de données de test pour détecter les outliers UNIVARIÉS et MULTIVARIÉS.
Ce script crée un dataset avec :
1. Des outliers univariés évidents (valeurs extrêmes dans une seule colonne)
2. Des outliers multivariés (combinaisons de valeurs normales individuellement mais anormales ensemble)
3. Des données normales pour la majorité
"""
import pandas as pd
import numpy as np
from pathlib import Path
# Configuration
np.random.seed(42)
N_NORMAL = 100 # Nombre de lignes normales
def generate_outlier_dataset():
    """Build a DataFrame mixing normal rows with controlled anomalies.

    The result contains three groups:
      * N_NORMAL baseline rows drawn from Gaussian distributions,
      * univariate outliers (one column holds an impossible value),
      * multivariate outliers (each value plausible alone, absurd together),
    plus a few boundary cases.  The ``Type`` column labels every row.
    """

    def as_row(rid, age, salaire, experience, performance, heures_sup, label):
        # Single place that fixes the record schema for all row groups.
        return {
            "ID": rid,
            "Age": age,
            "Salaire": salaire,
            "Experience": experience,
            "Performance": performance,
            "Heures_Sup": heures_sup,
            "Type": label,
        }

    data = []

    # --- 1. Baseline rows --------------------------------------------------
    print("📊 Génération des données normales...")
    for row_id in range(1, N_NORMAL + 1):
        # Positional args are evaluated left-to-right, preserving the RNG
        # draw order: Age, Salaire, Experience, Performance, Heures_Sup.
        data.append(as_row(
            row_id,
            np.random.normal(40, 10),        # age: mean 40, sd 10
            np.random.normal(35000, 8000),   # salary: mean 35k, sd 8k
            np.random.normal(10, 4),         # experience: mean 10 yrs, sd 4
            np.random.normal(75, 10),        # performance: mean 75/100, sd 10
            np.random.normal(5, 3),          # overtime: mean 5 h/month, sd 3
            "Normal",
        ))

    # --- 2. Univariate outliers (one impossible value per row) -------------
    print("🔴 Génération des outliers univariés...")
    univariate = [
        # (ID, Age, Salaire, Experience, Performance, Heures_Sup, Type)
        (101, 150, 38000, 12, 78, 6, "Outlier_Uni_Age"),        # age 150: impossible
        (102, 45, 500000, 15, 82, 8, "Outlier_Uni_Salaire"),    # ~10x normal salary
        (103, 35, -5000, 8, 72, 4, "Outlier_Uni_Salaire_Neg"),  # negative salary
        (104, 38, 42000, 11, 150, 7, "Outlier_Uni_Perf"),       # performance > 100
        (105, 42, 36000, 13, 76, -20, "Outlier_Uni_Heures"),    # negative overtime
    ]
    data.extend(as_row(*spec) for spec in univariate)

    # --- 3. Multivariate outliers (plausible alone, absurd together) -------
    print("🟣 Génération des outliers multivariés...")
    multivariate = [
        (201, 25, 32000, 30, 70, 5, "Outlier_Multi_Age_Exp"),        # 25 y/o with 30 yrs exp
        (202, 45, 80000, 15, 40, 2, "Outlier_Multi_Salaire_Perf"),   # high pay, low performance
        (203, 28, 95000, 1, 85, 15, "Outlier_Multi_Exp_Salaire"),    # 1 yr exp, very high pay
        (204, 65, 25000, 1, 60, 0, "Outlier_Multi_Senior_Junior"),   # senior age, junior profile
        (205, 35, 40000, 10, 100, 0, "Outlier_Multi_Perf_Heures"),   # perfect score, zero overtime
        (206, 22, 85000, 0, 95, 0, "Outlier_Multi_Impossible"),      # junior with senior everything
    ]
    data.extend(as_row(*spec) for spec in multivariate)

    # --- 4. Boundary cases --------------------------------------------------
    print("🎯 Génération des cas limites...")
    edge_cases = [
        (301, 22, 0, 0, 0, 0, "Cas_Limie_Zeros"),          # all zeros (label kept verbatim, sic)
        (302, 62, 70000, 40, 88, 3, "Normal_Senior"),      # long career, internally consistent
        (303, 25, 15000, 2, 65, 2, "Normal_Salaire_Min"),  # minimum-wage profile
    ]
    data.extend(as_row(*spec) for spec in edge_cases)

    frame = pd.DataFrame(data)
    # Round numeric columns for readability in the exported files.
    for col in ("Age", "Salaire", "Experience", "Performance", "Heures_Sup"):
        frame[col] = frame[col].round(2)
    # Fixed display order for the columns.
    return frame[["ID", "Age", "Experience", "Salaire", "Performance", "Heures_Sup", "Type"]]
def main():
    """Generate the outlier test dataset and write it as CSV + Excel.

    Output goes to a ``test_data`` directory next to this script — the
    previous version hard-coded a user-specific absolute path
    (``/home/sepehr/...``), which broke the generator on any other machine;
    deriving the path from ``__file__`` matches the sibling generator script.
    After writing, a summary of the expected outliers and manual testing
    instructions is printed.
    """
    print("=" * 70)
    print("🧪 GÉNÉRATEUR DE DONNÉES DE TEST - OUTLIERS UNIVARIÉS & MULTIVARIÉS")
    print("=" * 70)
    print()
    # Build the dataset
    df = generate_outlier_dataset()
    # Write next to the script so the generator works on any machine.
    output_dir = Path(__file__).resolve().parent / "test_data"
    output_dir.mkdir(exist_ok=True)
    csv_path = output_dir / "test_outliers_complete.csv"
    df.to_csv(csv_path, index=False)
    excel_path = output_dir / "test_outliers_complete.xlsx"
    df.to_excel(excel_path, index=False)  # requires openpyxl
    # Print dataset statistics
    print()
    print("=" * 70)
    print("📊 STATISTIQUES DU DATASET")
    print("=" * 70)
    print(f"✅ Total lignes : {len(df)}")
    print(f"📈 Colonnes : {len(df.columns)}")
    print()
    print("🔴 Outliers univariés attendus : 5")
    print("   - ID 101: Âge = 150 ans")
    print("   - ID 102: Salaire = 500,000€")
    print("   - ID 103: Salaire = -5,000€ (négatif)")
    print("   - ID 104: Performance = 150 (>100)")
    print("   - ID 105: Heures_Sup = -20 (négatif)")
    print()
    print("🟣 Outliers multivariés attendus : 6")
    print("   - ID 201: Âge=25 avec Exp=30 (impossible)")
    print("   - ID 202: Salaire=80k avec Perf=40 (incohérent)")
    print("   - ID 203: Exp=1 avec Salaire=95k (suspect)")
    print("   - ID 204: Âge=65 avec Exp=1 (incohérent)")
    print("   - ID 205: Perf=100 avec Heures_Sup=0 (rare)")
    print("   - ID 206: Âge=22, Exp=0, Salaire=85k (impossible)")
    print()
    print("=" * 70)
    print("💾 FICHIERS GÉNÉRÉS")
    print("=" * 70)
    print(f"📄 CSV   : {csv_path}")
    print(f"📊 Excel : {excel_path}")
    print()
    print("=" * 70)
    print("🎯 COMMENT TESTER")
    print("=" * 70)
    print("1. Importez le fichier 'test_outliers_complete.csv' dans l'application")
    print("2. Vérifiez que les colonnes sont bien détectées comme numériques")
    print("3. Les cercles ROUGES doivent apparaître sur les colonnes avec outliers univariés")
    print("4. Le cercle VIOLET doit apparaître (indicateur global multivarié)")
    print("5. Cliquez sur chaque indicateur pour voir les détails")
    print("6. Vérifiez la cohérence des outliers détectés")
    print()
    print("✨ Bon testing !")
    print("=" * 70)
    # Preview of the outlier rows only
    print()
    print("📋 APERÇU DES OUTLIERS DANS LE DATASET :")
    print("-" * 70)
    outliers_df = df[df["Type"].str.contains("Outlier", case=False, na=False)]
    print(outliers_df[["ID", "Type", "Age", "Experience", "Salaire", "Performance", "Heures_Sup"]].to_string(index=False))

View File

@@ -0,0 +1,266 @@
"""
Générateur de données de test pour l'outil d'analyse de données.
Crée des fichiers CSV et XLSX avec des corrélations et relations polynomiales.
"""
import pandas as pd
import numpy as np
from pathlib import Path
# Configuration
np.random.seed(42)
n_samples = 500
output_dir = Path(__file__).parent.parent / "test_data"
output_dir.mkdir(exist_ok=True)
print(f"📁 Génération des données de test dans: {output_dir}")
# ============================================================================
# 1. Dataset Ventes & Marketing (corrélations multiples, relations linéaires)
# ============================================================================
print("\n📊 Dataset: Ventes & Marketing")
ventes_data = {
# Budget marketing mensuel (€)
'budget_marketing': np.random.uniform(1000, 50000, n_samples),
# Heures de publicité TV
'tv_ads_hours': np.random.uniform(5, 100, n_samples),
# Nombre de posts sur réseaux sociaux
'social_media_posts': np.random.randint(10, 200, n_samples),
# Prix moyen du produit (€) - légère corrélation négative avec ventes
'prix_moyen': np.random.uniform(20, 100, n_samples),
# Temperature moyenne (°C) - effet saisonnier
'temperature': np.random.normal(20, 8, n_samples),
# Concurrence (index 1-10)
'concurrence': np.random.randint(1, 11, n_samples),
}
df_ventes = pd.DataFrame(ventes_data)
# Créer les ventes avec des relations réalistes
# Ventes de base
ventes_base = 5000
# Effet budget marketing (fortement positif)
effet_budget = df_ventes['budget_marketing'] * 0.8
# Effet TV ads (modéré)
effet_tv = df_ventes['tv_ads_hours'] * 25
# Effet social media (faible mais positif)
effet_social = df_ventes['social_media_posts'] * 8
# Effet prix (négatif - prix plus élevé = moins de ventes)
effet_prix = -df_ventes['prix_moyen'] * 15
# Effet température (positif - meilleur temps = plus de ventes)
effet_temp = df_ventes['temperature'] * 30
# Effet concurrence (négatif)
effet_concurrence = -df_ventes['concurrence'] * 200
# Bruit aléatoire
bruit = np.random.normal(0, 500, n_samples)
# Ventes totales
df_ventes['ventes'] = (ventes_base + effet_budget + effet_tv + effet_social +
effet_prix + effet_temp + effet_concurrence + bruit)
# Arrondir
df_ventes['ventes'] = df_ventes['ventes'].round(2)
# Sauvegarder
ventes_csv = output_dir / "ventes_marketing.csv"
ventes_excel = output_dir / "ventes_marketing.xlsx"
df_ventes.to_csv(ventes_csv, index=False)
df_ventes.to_excel(ventes_excel, index=False)
print(f"{ventes_csv.name}: {n_samples} lignes, {len(df_ventes.columns)} colonnes")
print(f"{ventes_excel.name}: {n_samples} lignes, {len(df_ventes.columns)} colonnes")
# ============================================================================
# 2. Dataset Production Industriel (relation polynomiale / quadratique)
# ============================================================================
print("\n🏭 Dataset: Production Industrielle")
production_data = {
# Température machine (°C)
'temperature_machine': np.random.uniform(150, 250, n_samples),
# Pression (bar)
'pression': np.random.uniform(2, 10, n_samples),
# Vitesse conveyor (m/min)
'vitesse_conveyor': np.random.uniform(50, 150, n_samples),
# Humidité (%)
'humidite': np.random.uniform(30, 70, n_samples),
# Qualité matière première (index 1-100)
'qualite_matiere': np.random.uniform(60, 100, n_samples),
}
df_production = pd.DataFrame(production_data)
# Relation polynomiale: température optimale ~200°C
# Efficacité = a*(T - Toptimal)^2 + b
T_optimal = 200
df_production['efficacite_production'] = (
-0.08 * (df_production['temperature_machine'] - T_optimal) ** 2
+ 95 # Efficacité maximale
+ df_production['pression'] * 1.5
+ df_production['vitesse_conveyor'] * 0.1
+ df_production['qualite_matiere'] * 0.3
+ np.random.normal(0, 3, n_samples)
)
# Borner entre 0 et 100
df_production['efficacite_production'] = df_production['efficacite_production'].clip(0, 100).round(2)
# Défauts de production (relation inverse avec l'efficacité)
df_production['defauts'] = (
100 - df_production['efficacite_production']
) * 0.5 + np.random.normal(0, 1, n_samples)
df_production['defauts'] = df_production['defauts'].clip(0, None).round(2)
# Sauvegarder
production_csv = output_dir / "production_industrielle.csv"
production_excel = output_dir / "production_industrielle.xlsx"
df_production.to_csv(production_csv, index=False)
df_production.to_excel(production_excel, index=False)
print(f"{production_csv.name}: {n_samples} lignes, {len(df_production.columns)} colonnes")
print(f"{production_excel.name}: {n_samples} lignes, {len(df_production.columns)} colonnes")
# ============================================================================
# 3. Dataset Santé & Fitness (relations mixtes)
# ============================================================================
print("\n🏃 Dataset: Santé & Fitness")
sante_data = {
# Âge (années)
'age': np.random.randint(18, 80, n_samples),
# Poids (kg)
'poids': np.random.normal(75, 15, n_samples),
# Taille (cm)
'taille': np.random.normal(170, 10, n_samples),
# Heures d'exercice par semaine
'heures_exercice': np.random.uniform(0, 15, n_samples),
# Calories consommées par jour
'calories_jour': np.random.normal(2200, 400, n_samples),
# Heures de sommeil
'heures_sommeil': np.random.normal(7, 1.5, n_samples),
# Niveau de stress (1-10)
'stress': np.random.randint(1, 11, n_samples),
}
df_sante = pd.DataFrame(sante_data)
# IMC (BMI)
df_sante['imc'] = (df_sante['poids'] / (df_sante['taille'] / 100) ** 2).round(2)
# Métabolisme de base (formule de Harris-Benedict simplifiée)
# Hommes: BMR = 88.362 + (13.397 × kg) + (4.799 × cm) - (5.677 × age)
df_sante['metabolisme_base'] = (
88.362
+ 13.397 * df_sante['poids']
+ 4.799 * df_sante['taille']
- 5.677 * df_sante['age']
+ np.random.normal(0, 50, n_samples)
).round(2)
# Niveau d'énergie (subjectif 1-10)
df_sante['niveau_energie'] = (
5
+ 0.3 * df_sante['heures_exercice']
- 0.2 * df_sante['stress']
+ 0.15 * df_sante['heures_sommeil']
- 0.01 * (df_sante['age'] - 30)
+ np.random.normal(0, 1, n_samples)
).clip(1, 10).round(2)
# Sauvegarder
sante_csv = output_dir / "sante_fitness.csv"
sante_excel = output_dir / "sante_fitness.xlsx"
df_sante.to_csv(sante_csv, index=False)
df_sante.to_excel(sante_excel, index=False)
print(f"{sante_csv.name}: {n_samples} lignes, {len(df_sante.columns)} colonnes")
print(f"{sante_excel.name}: {n_samples} lignes, {len(df_sante.columns)} colonnes")
# ============================================================================
# 4. Dataset Finance (relation exponentielle)
# ============================================================================
print("\n💰 Dataset: Finance & Investissement")
finance_data = {
# Montant investi (€)
'montant_investi': np.random.uniform(1000, 100000, n_samples),
# Durée investissement (années)
'duree_annees': np.random.uniform(1, 30, n_samples),
# Taux de rendement annuel (%)
'taux_rendement': np.random.uniform(2, 15, n_samples),
# Risque (1-10)
'niveau_risque': np.random.randint(1, 11, n_samples),
}
df_finance = pd.DataFrame(finance_data)
# Valeur finale avec intérêts composés: A = P(1 + r)^t
df_finance['valeur_finale'] = (
df_finance['montant_investi'] *
(1 + df_finance['taux_rendement'] / 100) ** df_finance['duree_annees']
* (1 - 0.02 * df_finance['niveau_risque']) # Pénalité de risque
+ np.random.normal(0, df_finance['montant_investi'] * 0.01, n_samples)
).round(2)
# Profit
df_finance['profit'] = (df_finance['valeur_finale'] - df_finance['montant_investi']).round(2)
# Rendement total (%)
df_finance['rendement_total'] = (df_finance['profit'] / df_finance['montant_investi'] * 100).round(2)
# Sauvegarder
finance_csv = output_dir / "finance_investissement.csv"
finance_excel = output_dir / "finance_investissement.xlsx"
df_finance.to_csv(finance_csv, index=False)
df_finance.to_excel(finance_excel, index=False)
print(f"{finance_csv.name}: {n_samples} lignes, {len(df_finance.columns)} colonnes")
print(f"{finance_excel.name}: {n_samples} lignes, {len(df_finance.columns)} colonnes")
# ============================================================================
# Résumé
# ============================================================================
print("\n" + "="*60)
print("✅ Tous les fichiers de test ont été générés avec succès !")
print("="*60)
print(f"\n📂 Répertoire: {output_dir}")
print("\n📋 Fichiers créés:")
print(" 1. ventes_marketing.csv/xlsx - Corrélations multiples linéaires")
print(" 2. production_industrielle.csv/xlsx - Relation polynomiale (quadratique)")
print(" 3. sante_fitness.csv/xlsx - Relations mixtes + IMC calculé")
print(" 4. finance_investissement.csv/xlsx - Relation exponentielle")
print("\n💡 Utilisez ces fichiers pour tester:")
print(" • Analyse de corrélation")
print(" • Régression linéaire, polynomiale, exponentielle")
print(" • Import CSV et Excel")
print("="*60)

32
backend/main.py Normal file
View File

@@ -0,0 +1,32 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.v1.upload import router as upload_router
from app.api.v1.analysis import router as analysis_router
from app.api.v1.reports import router as reports_router
app = FastAPI(title="Data_analysis API", version="0.1.0")
# CORS configuration
origins = ["*"] # Allow all origins for dev/homelab simplicity
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
expose_headers=["X-Column-Metadata", "Content-Disposition"]
)
# Register routers
app.include_router(upload_router, prefix="/api/v1")
app.include_router(analysis_router, prefix="/api/v1")
app.include_router(reports_router, prefix="/api/v1")
@app.get("/health")
def health_check():
return {"status": "ok", "service": "backend"}
@app.get("/")
def read_root():
return {"message": "Welcome to Data_analysis API"}

22
backend/pyproject.toml Normal file
View File

@@ -0,0 +1,22 @@
[project]
name = "backend"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"fastapi>=0.128.0",
"fpdf2>=2.8.5",
"httpx>=0.28.1",
"matplotlib>=3.10.0",
"openpyxl>=3.1.5",
"pandas>=2.3.3",
"pyarrow>=22.0.0",
"pydantic>=2.12.5",
"pytest>=9.0.2",
"python-multipart>=0.0.21",
"scikit-learn>=1.8.0",
"statsmodels>=0.14.6",
"sympy>=1.13.0",
"uvicorn>=0.40.0",
]

BIN
backend/test.pdf Normal file

Binary file not shown.

View File

@@ -0,0 +1,115 @@
ID,Age,Experience,Salaire,Performance,Heures_Sup,Type
1,44.97,12.59,33893.89,90.23,4.3,Normal
2,37.66,13.07,47633.7,70.31,6.63,Normal
3,35.37,10.97,31274.16,55.87,-0.17,Normal
4,34.38,11.26,26897.35,65.92,0.76,Normal
5,54.66,10.27,33193.79,60.75,3.37,Normal
6,41.11,11.5,25792.05,68.99,4.12,Normal
7,33.98,9.95,49818.23,64.42,7.47,Normal
8,27.79,2.16,36670.91,61.72,5.59,Normal
9,47.38,9.54,36370.95,71.99,0.56,Normal
10,32.8,14.23,31314.89,78.44,-0.29,Normal
11,43.24,7.29,31919.34,81.12,8.09,Normal
12,49.31,8.76,28286.26,78.31,7.93,Normal
13,35.21,5.57,33514.73,63.04,7.44,Normal
14,53.56,14.01,34423.92,78.62,3.06,Normal
15,43.61,9.86,47304.29,90.65,-2.86,Normal
16,48.22,8.8,35696.38,75.92,-0.96,Normal
17,37.8,15.91,37856.9,69.82,2.57,Normal
18,34.98,11.32,42323.22,69.7,6.54,Normal
19,40.97,7.19,42749.16,71.72,3.82,Normal
20,25.36,11.04,37368.96,75.05,4.3,Normal
21,25.85,8.63,31634.84,66.98,4.52,Normal
22,44.04,10.7,50089.49,77.58,4.78,Normal
23,20.81,10.24,34787.89,99.63,4.42,Normal
24,43.02,5.33,34722.31,86.43,7.26,Normal
25,47.91,15.61,27724.9,60.98,6.76,Normal
26,61.9,7.73,27075.71,76.0,3.49,Normal
27,24.49,5.75,35548.5,79.74,2.24,Normal
28,55.5,8.71,28733.97,83.14,1.31,Normal
29,42.27,3.57,45457.14,76.85,5.78,Normal
30,47.82,4.72,25104.39,80.22,5.89,Normal
31,42.5,7.28,37771.59,77.32,5.88,Normal
32,32.86,11.9,49926.2,63.09,6.97,Normal
33,30.25,14.63,41296.68,66.79,7.89,Normal
34,44.13,17.59,41576.48,72.55,2.74,Normal
35,31.1,9.69,28473.52,78.41,5.83,Normal
36,48.27,15.81,35104.02,72.35,13.16,Normal
37,46.26,5.72,28142.74,79.82,4.33,Normal
38,47.14,9.71,38785.9,66.53,0.46,Normal
39,35.53,10.86,41851.19,62.54,5.52,Normal
40,43.85,10.61,27929.14,75.58,1.57,Normal
41,43.58,14.33,39486.28,85.54,0.87,Normal
42,30.62,12.06,39120.28,80.15,16.56,Normal
43,45.71,13.82,44084.53,81.51,4.05,Normal
44,47.59,9.05,28817.4,70.15,5.25,Normal
45,63.15,12.75,20061.88,58.87,3.58,Normal
46,50.89,5.69,35514.24,67.85,7.04,Normal
47,32.7,10.18,36731.67,68.48,11.43,Normal
48,46.34,10.75,18798.86,68.38,7.56,Normal
49,32.07,12.02,34082.11,83.66,1.4,Normal
50,36.65,7.39,31200.44,92.65,6.21,Normal
51,27.39,18.49,42342.9,85.32,0.44,Normal
52,35.16,7.17,45135.29,79.44,7.32,Normal
53,30.73,-2.97,34523.8,64.76,4.24,Normal
54,27.52,4.28,48059.29,70.6,5.39,Normal
55,54.41,14.65,23513.1,75.1,2.06,Normal
56,44.62,7.6,36592.48,75.7,3.84,Normal
57,41.14,16.34,40297.05,62.62,11.4,Normal
58,20.48,12.35,33785.72,77.81,3.13,Normal
59,37.92,7.64,31055.99,83.5,6.07,Normal
60,33.07,11.23,42196.8,83.13,6.89,Normal
61,31.71,12.99,30518.55,81.1,4.94,Normal
62,41.17,7.63,45221.32,80.47,4.39,Normal
63,37.82,13.3,43790.21,83.14,8.92,Normal
64,40.21,8.76,40455.62,78.24,4.61,Normal
65,40.97,6.73,39761.26,95.92,1.98,Normal
66,27.86,13.17,44264.89,81.24,6.89,Normal
67,39.88,10.3,27821.97,68.23,7.93,Normal
68,38.53,8.71,28396.02,79.13,3.31,Normal
69,31.78,10.98,36949.5,69.93,3.59,Normal
70,42.32,4.37,23415.33,67.82,4.36,Normal
71,43.11,13.43,46802.85,73.4,4.94,Normal
72,29.97,8.85,34851.89,78.23,2.52,Normal
73,45.19,9.56,47261.91,79.02,7.07,Normal
74,35.99,10.05,36792.74,75.98,2.68,Normal
75,40.25,15.8,38983.99,84.59,11.46,Normal
76,32.33,10.73,41978.57,96.9,2.58,Normal
77,31.6,1.5,30204.86,69.74,2.72,Normal
78,41.5,17.5,37734.05,84.5,3.27,Normal
79,31.02,4.72,38935.35,93.31,8.54,Normal
80,35.31,15.42,21294.92,73.85,8.71,Normal
81,24.06,10.02,30205.0,75.47,3.65,Normal
82,46.23,9.43,26459.04,76.2,6.54,Normal
83,47.12,3.86,26002.86,87.78,6.0,Normal
84,32.52,10.46,47409.22,86.79,5.2,Normal
85,60.61,9.0,49042.73,84.72,6.94,Normal
86,53.69,12.74,27280.61,85.58,-0.28,Normal
87,28.17,8.92,18686.14,82.18,9.51,Normal
88,40.74,4.48,48028.92,57.97,4.83,Normal
89,43.84,1.73,34738.44,74.11,1.09,Normal
90,46.7,6.24,37932.79,69.86,1.82,Normal
91,39.37,6.06,42641.14,80.04,3.41,Normal
92,32.07,5.86,34143.76,69.46,1.41,Normal
93,59.65,7.2,35282.11,77.14,4.66,Normal
94,37.79,13.03,39913.33,69.69,3.27,Normal
95,37.25,3.94,16584.63,88.67,9.93,Normal
96,37.51,11.25,39612.46,105.79,8.36,Normal
97,38.72,3.57,27355.68,77.03,2.73,Normal
98,25.78,5.67,29827.42,91.87,7.64,Normal
99,39.92,10.31,46839.55,66.39,9.57,Normal
100,45.39,9.24,26702.03,66.24,0.85,Normal
101,150.0,12.0,38000.0,78.0,6.0,Outlier_Uni_Age
102,45.0,15.0,500000.0,82.0,8.0,Outlier_Uni_Salaire
103,35.0,8.0,-5000.0,72.0,4.0,Outlier_Uni_Salaire_Neg
104,38.0,11.0,42000.0,150.0,7.0,Outlier_Uni_Perf
105,42.0,13.0,36000.0,76.0,-20.0,Outlier_Uni_Heures
201,25.0,30.0,32000.0,70.0,5.0,Outlier_Multi_Age_Exp
202,45.0,15.0,80000.0,40.0,2.0,Outlier_Multi_Salaire_Perf
203,28.0,1.0,95000.0,85.0,15.0,Outlier_Multi_Exp_Salaire
204,65.0,1.0,25000.0,60.0,0.0,Outlier_Multi_Senior_Junior
205,35.0,10.0,40000.0,100.0,0.0,Outlier_Multi_Perf_Heures
206,22.0,0.0,85000.0,95.0,0.0,Outlier_Multi_Impossible
301,22.0,0.0,0.0,0.0,0.0,Cas_Limie_Zeros
302,62.0,40.0,70000.0,88.0,3.0,Normal_Senior
303,25.0,2.0,15000.0,65.0,2.0,Normal_Salaire_Min
1 ID Age Experience Salaire Performance Heures_Sup Type
2 1 44.97 12.59 33893.89 90.23 4.3 Normal
3 2 37.66 13.07 47633.7 70.31 6.63 Normal
4 3 35.37 10.97 31274.16 55.87 -0.17 Normal
5 4 34.38 11.26 26897.35 65.92 0.76 Normal
6 5 54.66 10.27 33193.79 60.75 3.37 Normal
7 6 41.11 11.5 25792.05 68.99 4.12 Normal
8 7 33.98 9.95 49818.23 64.42 7.47 Normal
9 8 27.79 2.16 36670.91 61.72 5.59 Normal
10 9 47.38 9.54 36370.95 71.99 0.56 Normal
11 10 32.8 14.23 31314.89 78.44 -0.29 Normal
12 11 43.24 7.29 31919.34 81.12 8.09 Normal
13 12 49.31 8.76 28286.26 78.31 7.93 Normal
14 13 35.21 5.57 33514.73 63.04 7.44 Normal
15 14 53.56 14.01 34423.92 78.62 3.06 Normal
16 15 43.61 9.86 47304.29 90.65 -2.86 Normal
17 16 48.22 8.8 35696.38 75.92 -0.96 Normal
18 17 37.8 15.91 37856.9 69.82 2.57 Normal
19 18 34.98 11.32 42323.22 69.7 6.54 Normal
20 19 40.97 7.19 42749.16 71.72 3.82 Normal
21 20 25.36 11.04 37368.96 75.05 4.3 Normal
22 21 25.85 8.63 31634.84 66.98 4.52 Normal
23 22 44.04 10.7 50089.49 77.58 4.78 Normal
24 23 20.81 10.24 34787.89 99.63 4.42 Normal
25 24 43.02 5.33 34722.31 86.43 7.26 Normal
26 25 47.91 15.61 27724.9 60.98 6.76 Normal
27 26 61.9 7.73 27075.71 76.0 3.49 Normal
28 27 24.49 5.75 35548.5 79.74 2.24 Normal
29 28 55.5 8.71 28733.97 83.14 1.31 Normal
30 29 42.27 3.57 45457.14 76.85 5.78 Normal
31 30 47.82 4.72 25104.39 80.22 5.89 Normal
32 31 42.5 7.28 37771.59 77.32 5.88 Normal
33 32 32.86 11.9 49926.2 63.09 6.97 Normal
34 33 30.25 14.63 41296.68 66.79 7.89 Normal
35 34 44.13 17.59 41576.48 72.55 2.74 Normal
36 35 31.1 9.69 28473.52 78.41 5.83 Normal
37 36 48.27 15.81 35104.02 72.35 13.16 Normal
38 37 46.26 5.72 28142.74 79.82 4.33 Normal
39 38 47.14 9.71 38785.9 66.53 0.46 Normal
40 39 35.53 10.86 41851.19 62.54 5.52 Normal
41 40 43.85 10.61 27929.14 75.58 1.57 Normal
42 41 43.58 14.33 39486.28 85.54 0.87 Normal
43 42 30.62 12.06 39120.28 80.15 16.56 Normal
44 43 45.71 13.82 44084.53 81.51 4.05 Normal
45 44 47.59 9.05 28817.4 70.15 5.25 Normal
46 45 63.15 12.75 20061.88 58.87 3.58 Normal
47 46 50.89 5.69 35514.24 67.85 7.04 Normal
48 47 32.7 10.18 36731.67 68.48 11.43 Normal
49 48 46.34 10.75 18798.86 68.38 7.56 Normal
50 49 32.07 12.02 34082.11 83.66 1.4 Normal
51 50 36.65 7.39 31200.44 92.65 6.21 Normal
52 51 27.39 18.49 42342.9 85.32 0.44 Normal
53 52 35.16 7.17 45135.29 79.44 7.32 Normal
54 53 30.73 -2.97 34523.8 64.76 4.24 Normal
55 54 27.52 4.28 48059.29 70.6 5.39 Normal
56 55 54.41 14.65 23513.1 75.1 2.06 Normal
57 56 44.62 7.6 36592.48 75.7 3.84 Normal
58 57 41.14 16.34 40297.05 62.62 11.4 Normal
59 58 20.48 12.35 33785.72 77.81 3.13 Normal
60 59 37.92 7.64 31055.99 83.5 6.07 Normal
61 60 33.07 11.23 42196.8 83.13 6.89 Normal
62 61 31.71 12.99 30518.55 81.1 4.94 Normal
63 62 41.17 7.63 45221.32 80.47 4.39 Normal
64 63 37.82 13.3 43790.21 83.14 8.92 Normal
65 64 40.21 8.76 40455.62 78.24 4.61 Normal
66 65 40.97 6.73 39761.26 95.92 1.98 Normal
67 66 27.86 13.17 44264.89 81.24 6.89 Normal
68 67 39.88 10.3 27821.97 68.23 7.93 Normal
69 68 38.53 8.71 28396.02 79.13 3.31 Normal
70 69 31.78 10.98 36949.5 69.93 3.59 Normal
71 70 42.32 4.37 23415.33 67.82 4.36 Normal
72 71 43.11 13.43 46802.85 73.4 4.94 Normal
73 72 29.97 8.85 34851.89 78.23 2.52 Normal
74 73 45.19 9.56 47261.91 79.02 7.07 Normal
75 74 35.99 10.05 36792.74 75.98 2.68 Normal
76 75 40.25 15.8 38983.99 84.59 11.46 Normal
77 76 32.33 10.73 41978.57 96.9 2.58 Normal
78 77 31.6 1.5 30204.86 69.74 2.72 Normal
79 78 41.5 17.5 37734.05 84.5 3.27 Normal
80 79 31.02 4.72 38935.35 93.31 8.54 Normal
81 80 35.31 15.42 21294.92 73.85 8.71 Normal
82 81 24.06 10.02 30205.0 75.47 3.65 Normal
83 82 46.23 9.43 26459.04 76.2 6.54 Normal
84 83 47.12 3.86 26002.86 87.78 6.0 Normal
85 84 32.52 10.46 47409.22 86.79 5.2 Normal
86 85 60.61 9.0 49042.73 84.72 6.94 Normal
87 86 53.69 12.74 27280.61 85.58 -0.28 Normal
88 87 28.17 8.92 18686.14 82.18 9.51 Normal
89 88 40.74 4.48 48028.92 57.97 4.83 Normal
90 89 43.84 1.73 34738.44 74.11 1.09 Normal
91 90 46.7 6.24 37932.79 69.86 1.82 Normal
92 91 39.37 6.06 42641.14 80.04 3.41 Normal
93 92 32.07 5.86 34143.76 69.46 1.41 Normal
94 93 59.65 7.2 35282.11 77.14 4.66 Normal
95 94 37.79 13.03 39913.33 69.69 3.27 Normal
96 95 37.25 3.94 16584.63 88.67 9.93 Normal
97 96 37.51 11.25 39612.46 105.79 8.36 Normal
98 97 38.72 3.57 27355.68 77.03 2.73 Normal
99 98 25.78 5.67 29827.42 91.87 7.64 Normal
100 99 39.92 10.31 46839.55 66.39 9.57 Normal
101 100 45.39 9.24 26702.03 66.24 0.85 Normal
102 101 150.0 12.0 38000.0 78.0 6.0 Outlier_Uni_Age
103 102 45.0 15.0 500000.0 82.0 8.0 Outlier_Uni_Salaire
104 103 35.0 8.0 -5000.0 72.0 4.0 Outlier_Uni_Salaire_Neg
105 104 38.0 11.0 42000.0 150.0 7.0 Outlier_Uni_Perf
106 105 42.0 13.0 36000.0 76.0 -20.0 Outlier_Uni_Heures
107 201 25.0 30.0 32000.0 70.0 5.0 Outlier_Multi_Age_Exp
108 202 45.0 15.0 80000.0 40.0 2.0 Outlier_Multi_Salaire_Perf
109 203 28.0 1.0 95000.0 85.0 15.0 Outlier_Multi_Exp_Salaire
110 204 65.0 1.0 25000.0 60.0 0.0 Outlier_Multi_Senior_Junior
111 205 35.0 10.0 40000.0 100.0 0.0 Outlier_Multi_Perf_Heures
112 206 22.0 0.0 85000.0 95.0 0.0 Outlier_Multi_Impossible
113 301 22.0 0.0 0.0 0.0 0.0 Cas_Limie_Zeros
114 302 62.0 40.0 70000.0 88.0 3.0 Normal_Senior
115 303 25.0 2.0 15000.0 65.0 2.0 Normal_Salaire_Min

Binary file not shown.

View File

@@ -0,0 +1,80 @@
from fastapi.testclient import TestClient
from main import app
# In-process HTTP client: requests are dispatched straight to the ASGI app,
# so no running server is needed for these tests.
client = TestClient(app)
def test_outlier_detection_univariate():
    """The univariate detector flags the single extreme value and cites IQR."""
    samples = [{"val": v} for v in (10, 11, 12, 10, 100)]  # 100 is the outlier
    payload = {"data": samples, "columns": ["val"], "method": "univariate"}
    response = client.post("/api/v1/analysis/detect-outliers", json=payload)
    assert response.status_code == 200
    body = response.json()
    assert body["total_count"] == 1
    flagged = body["outliers"][0]
    # Index 4 is the position of the 100 sample; the reason mentions IQR bounds.
    assert flagged["index"] == 4
    assert "IQR bounds" in flagged["reasons"][0]
def test_outlier_detection_multivariate():
    """A point far from a tight cluster is flagged by the multivariate method."""
    points = [
        {"x": 1, "y": 1},
        {"x": 1.1, "y": 0.9},
        {"x": 0.9, "y": 1.1},
        {"x": 10, "y": 10},  # Anomaly
    ]
    payload = {"data": points, "columns": ["x", "y"], "method": "multivariate"}
    response = client.post("/api/v1/analysis/detect-outliers", json=payload)
    assert response.status_code == 200
    # At least the distant (10, 10) point should be reported.
    assert response.json()["total_count"] >= 1
def test_feature_importance():
    """Importance scores come back as one {feature, score} entry per feature."""
    # x1 = i, x2 = 10*i, y = 2*i for i in 1..5 — same rows as a literal list.
    rows = [{"x1": i, "x2": 10 * i, "y": 2 * i} for i in range(1, 6)]
    payload = {"data": rows, "features": ["x1", "x2"], "target": "y"}
    response = client.post("/api/v1/analysis/feature-importance", json=payload)
    assert response.status_code == 200
    importances = response.json()["importances"]
    assert len(importances) == 2
    first = importances[0]
    assert "feature" in first
    assert "score" in first
def test_run_regression():
    """A linear fit on near-linear data yields a high R² and an intercept term."""
    observations = [
        {"x": 1, "y": 2.1},
        {"x": 2, "y": 3.9},
        {"x": 3, "y": 6.2},
        {"x": 4, "y": 8.1},
        {"x": 5, "y": 10.3},
    ]
    payload = {
        "data": observations,
        "x_features": ["x"],
        "y_target": "y",
        "model_type": "linear",
    }
    response = client.post("/api/v1/analysis/run-regression", json=payload)
    assert response.status_code == 200
    body = response.json()
    assert "results" in body
    results = body["results"]
    # y ≈ 2x, so the fit should explain well over 90% of the variance.
    assert results["r_squared"] > 0.9
    assert "const" in results["coefficients"]

View File

@@ -0,0 +1,37 @@
from fastapi.testclient import TestClient
import pandas as pd
import io
import pyarrow as pa
from main import app
# In-process HTTP client: requests are dispatched straight to the ASGI app,
# so no running server is needed for these tests.
client = TestClient(app)
def test_health_check():
    """GET /health reports the backend service as alive."""
    resp = client.get("/health")
    assert resp.status_code == 200
    assert resp.json() == {"status": "ok", "service": "backend"}
def test_upload_csv():
    """Uploading a small CSV returns an Arrow IPC stream plus column metadata.

    Verifies the content type and metadata header, then decodes the Arrow
    stream and checks that the table round-trips the uploaded rows — shape,
    column order, and the cell values themselves.
    """
    # Create a dummy CSV
    csv_content = "name,age\nAlice,30\nBob,25"
    file = ("test.csv", csv_content, "text/csv")
    response = client.post("/api/v1/upload", files={"file": file})
    assert response.status_code == 200
    assert response.headers["content-type"] == "application/vnd.apache.arrow.stream"
    assert "X-Column-Metadata" in response.headers
    # Verify Arrow data
    buffer = io.BytesIO(response.content)
    with pa.ipc.open_stream(buffer) as reader:
        table = reader.read_all()
    df = table.to_pandas()
    assert df.shape == (2, 2)
    assert list(df.columns) == ["name", "age"]
    # Check the values too, not just the shape: a decoding bug could keep the
    # shape intact while corrupting the payload.
    assert df["name"].tolist() == ["Alice", "Bob"]
    assert df["age"].tolist() == [30, 25]
def test_upload_invalid_format():
    """Non-tabular uploads are rejected with a 400 and an explanatory detail."""
    bad_file = ("test.txt", "invalid content", "text/plain")
    response = client.post("/api/v1/upload", files={"file": bad_file})
    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "Only .xlsx, .xls and .csv files are supported" in detail

1101
backend/uv.lock generated Normal file

File diff suppressed because it is too large Load Diff