Compare commits
4 Commits
2275e8b5be
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 74c720c7ba | |||
| e44c929ce7 | |||
| b7e2ded889 | |||
| 8292dc15b3 |
118
pdf_to_latex.py
Normal file
118
pdf_to_latex.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import os
|
||||
import fitz # PyMuPDF
|
||||
import logging
|
||||
from PIL import Image
|
||||
import io
|
||||
import tempfile
|
||||
from pix2text import Pix2Text
|
||||
import re
|
||||
|
||||
# Configure logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used throughout this script.
logger = logging.getLogger(__name__)
||||
|
||||
class LatexExtractor:
    """Extract LaTeX equations from images and PDF pages using Pix2Text."""

    def __init__(self):
        # Initialize Pix2Text with the math-formula-detection ('mfd') engine;
        # math_dpi controls the rendering resolution used by the math OCR.
        self.p2t = Pix2Text(math_engine='mfd', math_dpi=150)
        logger.info("Initialized Pix2Text with LaTeX OCR engine")

    def detect_equations_from_image(self, image_path):
        """Detect and extract LaTeX equations from an image.

        Args:
            image_path: Path of the image file to run OCR on.

        Returns:
            List of LaTeX source strings; an empty list on any failure.
        """
        logger.info(f"Processing image: {image_path}")
        try:
            result = self.p2t.recognize(image_path)
            # Keep only recognized math blocks that carry non-empty text.
            math_blocks = [item.get('text') for item in result
                           if item.get('type') == 'math' and item.get('text')]
            logger.info(f"Extracted {len(math_blocks)} LaTeX equations from image")
            return math_blocks
        except Exception as e:
            logger.error(f"Error extracting LaTeX from image: {str(e)}")
            return []

    def extract_equations_from_pdf(self, pdf_path, output_dir=None):
        """Extract LaTeX equations from each page of a PDF.

        Args:
            pdf_path: Path to the PDF file.
            output_dir: Directory for the Markdown summary; defaults to an
                "equations" directory next to the PDF.

        Returns:
            List of dicts with "page", "index", and "latex" keys.
        """
        logger.info(f"Processing PDF: {pdf_path}")

        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(pdf_path), "equations")
        os.makedirs(output_dir, exist_ok=True)

        doc = fitz.open(pdf_path)
        try:
            logger.info(f"PDF opened successfully. Document has {len(doc)} pages")

            all_equations = []
            for page_num, page in enumerate(doc, 1):
                logger.info(f"Processing page {page_num}/{len(doc)}")

                # Render page at 2x resolution for better OCR accuracy.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

                # Save the page image to a temporary file for Pix2Text.
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                    pix.save(tmp.name)
                    tmp_path = tmp.name

                try:
                    page_equations = self.detect_equations_from_image(tmp_path)
                finally:
                    # Fix: always remove the temp file, even if OCR raises,
                    # so failed pages no longer leak files in the temp dir.
                    os.unlink(tmp_path)

                # Record page/index metadata alongside each equation.
                for i, eq in enumerate(page_equations):
                    all_equations.append({
                        "page": page_num,
                        "index": i + 1,
                        "latex": eq
                    })
        finally:
            # Fix: the document handle was previously never closed.
            doc.close()

        # Save all equations to a Markdown file.
        md_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(pdf_path))[0]}_equations.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Equations from {os.path.basename(pdf_path)}\n\n")
            for eq in all_equations:
                f.write(f"## Page {eq['page']} - Equation {eq['index']}\n\n")
                f.write(f"$$\n{eq['latex']}\n$$\n\n")

        logger.info(f"Extracted {len(all_equations)} equations. Saved to {md_path}")
        return all_equations
|
||||
def main():
    """Command-line entry point: parse arguments and run the extractor."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Extract LaTeX equations from PDF documents")
    arg_parser.add_argument("pdf_path", help="Path to the PDF file")
    arg_parser.add_argument("--output_dir", default=None,
                            help="Directory to save extracted equations")
    arg_parser.add_argument("--verbose", "-v", action="store_true",
                            help="Enable verbose logging (DEBUG level)")
    options = arg_parser.parse_args()

    # Raise log verbosity when requested.
    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Extract equations from the given PDF.
    LatexExtractor().extract_equations_from_pdf(options.pdf_path, options.output_dir)


if __name__ == "__main__":
    main()
||||
426
pdf_to_markdown.py
Normal file
426
pdf_to_markdown.py
Normal file
@@ -0,0 +1,426 @@
|
||||
import os
|
||||
import pymupdf # PyMuPDF
|
||||
import re
|
||||
import logging
|
||||
import fitz # PyMuPDF
|
||||
import tempfile
|
||||
from PIL import Image
|
||||
import requests
|
||||
import base64
|
||||
import io
|
||||
from pathlib import Path
|
||||
import importlib.util
|
||||
|
||||
# Configure logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used throughout this script.
logger = logging.getLogger(__name__)
||||
|
||||
# Configuration for the Ollama API (or another AI model endpoint).
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "PetrosStav/gemma3-tools:4b"  # or another multimodal model
||||
|
||||
# Check if pix2text is available for LaTeX extraction.
try:
    from pix2text import Pix2Text
    # Flag consulted later to choose between Pix2Text OCR and the
    # regex-based fallback extraction.
    PIX2TEXT_AVAILABLE = True
    logger.info("Pix2Text is available - will use it for LaTeX equations")
except ImportError:
    PIX2TEXT_AVAILABLE = False
    logger.warning("Pix2Text not found - LaTeX equations will be extracted using basic methods")
||||
|
||||
def image_to_base64(image_path):
    """Read an image file and return its contents as a base64 string."""
    with open(image_path, "rb") as img_file:
        raw_bytes = img_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
||||
|
||||
def get_image_description(image_path, language="english"):
    """Ask the Ollama API to describe an image in the requested language.

    Returns the model's description, or a human-readable error string when
    the API call fails.
    """
    try:
        encoded_image = image_to_base64(image_path)

        # Pick the prompt that matches the requested language.
        if language.lower() == "french":
            prompt = "Décris cette image en détail. S'il s'agit d'un graphique, d'un diagramme ou d'une figure, explique ce qu'elle représente avec précision."
        else:
            prompt = "Describe this image in detail. If this is a chart, diagram, or figure, explain what it represents precisely."

        payload = {
            "model": OLLAMA_MODEL,
            "prompt": prompt,
            "images": [encoded_image],
            "stream": False
        }
        response = requests.post(OLLAMA_API_URL, json=payload)

        if response.status_code != 200:
            logger.error(f"Ollama API error: {response.status_code} - {response.text}")
            return "Error generating description from image."
        return response.json()["response"].strip()
    except Exception as e:
        logger.error(f"Error in image description generation: {str(e)}")
        return f"Description not available: {str(e)}"
||||
|
||||
def extract_images_from_page(page, output_dir, pdf_name, page_num):
    """Extract images from a PDF page using the page-rendering method
    instead of only extracting embedded images (which can come out black).

    Three strategies run in order; every surviving image path is collected:
      1. direct extraction of embedded images (dropped when mostly black),
      2. a full-page render at 2x zoom (always kept),
      3. clipped renders of heuristically detected image areas.

    Returns the list of saved image file paths.
    """
    images_paths = []

    # Method 1: direct extraction (may yield black images)
    try:
        embedded_images = page.get_images(full=True)
        logger.info(f"Found {len(embedded_images)} embedded images on page {page_num}")

        for img_index, img in enumerate(embedded_images, 1):
            try:
                # img[0] is the image's xref inside the PDF
                xref = img[0]
                base_image = page.parent.extract_image(xref)
                if base_image:
                    image_bytes = base_image["image"]
                    ext = base_image["ext"]

                    # Image path
                    image_filename = f"{pdf_name}-page{page_num}-embed{img_index}.{ext}"
                    image_path = os.path.join(output_dir, image_filename)

                    # Save the image
                    with open(image_path, "wb") as img_file:
                        img_file.write(image_bytes)
                    logger.info(f"Embedded image saved: {image_path}")

                    # Check that the image is not black
                    pil_img = Image.open(image_path)
                    if is_image_mostly_black(pil_img):
                        # NOTE(review): the black file is kept on disk, only
                        # skipped from the result list — confirm intended.
                        logger.warning(f"Image {image_path} appears to be mostly black, will be ignored")
                    else:
                        images_paths.append(image_path)
            except Exception as e:
                logger.error(f"Error extracting embedded image {img_index} on page {page_num}: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting embedded images from page {page_num}: {str(e)}")

    # Method 2: render the full page (better quality, works even when
    # embedded images come out black)
    try:
        # Render the whole page at high resolution
        zoom = 2  # zoom factor for better resolution
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Save the full-page image
        page_image_filename = f"{pdf_name}-page{page_num}-full.png"
        page_image_path = os.path.join(output_dir, page_image_filename)
        pix.save(page_image_path)
        logger.info(f"Full page image saved: {page_image_path}")

        # The full-page image is always kept
        images_paths.append(page_image_path)

        # Method 3: extract image areas on the page.
        # Uses a heuristic to detect rectangular zones that may hold
        # images, charts or diagrams.
        rect_areas = detect_image_areas(page)

        for i, rect in enumerate(rect_areas, 1):
            try:
                # Clip a region of the page (reuses the 2x matrix above)
                clip_pix = page.get_pixmap(matrix=mat, clip=rect)

                # Save the clipped image
                clip_filename = f"{pdf_name}-page{page_num}-clip{i}.png"
                clip_path = os.path.join(output_dir, clip_filename)
                clip_pix.save(clip_path)

                # Keep only clips larger than 100x100 px that are not black
                pil_img = Image.open(clip_path)
                if pil_img.width > 100 and pil_img.height > 100 and not is_image_mostly_black(pil_img):
                    logger.info(f"Detected image area saved: {clip_path}")
                    images_paths.append(clip_path)
                else:
                    # Delete small or black areas
                    os.remove(clip_path)
                    logger.info(f"Image area too small or black, ignored: {clip_path}")
            except Exception as e:
                logger.error(f"Error extracting image area {i} on page {page_num}: {str(e)}")
    except Exception as e:
        logger.error(f"Error rendering page {page_num}: {str(e)}")

    return images_paths
||||
|
||||
def is_image_mostly_black(image, threshold=0.95):
    """Return True when the fraction of near-black pixels exceeds *threshold*."""
    # Work on a grayscale version so pixel values are plain intensities.
    if image.mode != 'L':
        image = image.convert('L')

    # Count pixels darker than the near-black cutoff (intensity < 20).
    data = image.getdata()
    dark = 0
    for value in data:
        if value < 20:
            dark += 1

    # Compare the dark-pixel ratio against the threshold.
    return dark / len(data) > threshold
||||
|
||||
def detect_image_areas(page):
    """Detect potential image areas on a page.

    Simple heuristic for rectangular zones that might contain images,
    charts or diagrams. Returns a list of fitz.Rect candidates (possibly
    empty).
    """
    # Get the page's layout blocks
    blocks = page.get_text("dict")["blocks"]

    # Collect blocks that are not text
    image_areas = []

    for block in blocks:
        # Image blocks generally have a type different from 0 (text)
        if block["type"] != 0:
            rect = fitz.Rect(block["bbox"])
            # Ignore areas that are too small
            if rect.width > 50 and rect.height > 50:
                image_areas.append(rect)

    # If no area was detected, try a different approach
    if not image_areas:
        # Split the page into sections and treat sections without text
        # as potential candidates
        page_rect = page.rect
        text_areas = []

        # Collect the text areas
        for block in blocks:
            if block["type"] == 0:  # text block
                text_areas.append(fitz.Rect(block["bbox"]))

        # If we have text areas, treat the rest as potential image zones
        if text_areas:
            # Simple heuristic: split the page into 4 quadrants
            mid_x = page_rect.width / 2
            mid_y = page_rect.height / 2

            quadrants = [
                fitz.Rect(0, 0, mid_x, mid_y),
                fitz.Rect(mid_x, 0, page_rect.width, mid_y),
                fitz.Rect(0, mid_y, mid_x, page_rect.height),
                fitz.Rect(mid_x, mid_y, page_rect.width, page_rect.height)
            ]

            # Check each quadrant
            for quad in quadrants:
                # Does the quadrant intersect any text area?
                contains_text = any(quad.intersects(text_area) for text_area in text_areas)

                # Keep text-free quadrants larger than 100x100 points
                if not contains_text and quad.width > 100 and quad.height > 100:
                    image_areas.append(quad)

    return image_areas
||||
|
||||
# Single precompiled alternation of the equation heuristics; compiled once at
# import time instead of being rebuilt and scanned pattern-by-pattern per line.
_EQUATION_PATTERN = re.compile(
    r'[=<>+\-*/±≈≤≥]'                                  # operators / relations
    r'|[a-zA-Z][_^]'                                    # sub/superscript notation
    r'|\\frac|\\int|\\sum|\\prod|\\sqrt'                # fractions, integrals, etc.
    r'|\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\theta|\\lambda|\\mu|\\pi'  # Greek letters
    r'|\$\$.*?\$\$|\$.*?\$'                             # already formatted LaTeX
)


def extract_latex_from_text(text):
    """Extract and enhance mathematical equations from text using basic
    pattern matching.

    Args:
        text: Plain text (typically one PDF page's extracted text).

    Returns:
        List of cleaned-up LaTeX equation strings, in line order.
    """
    latex_chunks = []

    # Search line by line
    for line in text.splitlines():
        line = line.strip()
        # Skip lines that are too long (probably not equations)
        if len(line) > 150:
            continue

        # A line matching any heuristic pattern is treated as an equation
        if _EQUATION_PATTERN.search(line):
            # Strip existing math delimiters before reformatting
            eq = line.replace('$$', '').replace('$', '')
            eq = format_equation_for_latex(eq)
            latex_chunks.append(eq)

    return latex_chunks
||||
|
||||
def extract_latex_with_pix2text(page_image_path):
    """Extract LaTeX equations from an image using Pix2Text.

    Returns a list of LaTeX strings; empty when Pix2Text is missing or
    recognition fails.
    """
    if not PIX2TEXT_AVAILABLE:
        logger.warning("Pix2Text is not available. Install it with: pip install pix2text")
        return []

    try:
        # Build a recognizer with LaTeX OCR capabilities
        recognizer = Pix2Text(math_engine='mfd')

        # Run recognition on the page image
        recognized = recognizer.recognize(page_image_path)

        # Keep only the recognized math blocks carrying non-empty text
        equations = [entry.get('text') for entry in recognized
                     if entry.get('type') == 'math' and entry.get('text')]

        logger.info(f"Extracted {len(equations)} equations using Pix2Text")
        return equations
    except Exception as e:
        logger.error(f"Error extracting equations with Pix2Text: {str(e)}")
        return []
||||
|
||||
def format_equation_for_latex(eq_text):
    """Improves LaTeX formatting of equations.

    Braces sub/superscripts, strips trailing equation numbers, converts
    simple a/b fractions to \\frac, and normalizes operator spacing.

    Args:
        eq_text: Raw equation text extracted from a document.

    Returns:
        The cleaned-up LaTeX string.
    """
    # 1. Brace subscripts: a_i -> a_{i}.
    #    Fix: the old replacement emitted a_{(i)}, rendering stray parentheses.
    eq_text = re.sub(r'([a-zA-Z])_([a-zA-Z0-9]+)', r'\1_{\2}', eq_text)

    # 2. Brace superscripts: c^2 -> c^{2} (same stray-parentheses fix).
    eq_text = re.sub(r'([a-zA-Z0-9])\^([a-zA-Z0-9]+)', r'\1^{\2}', eq_text)

    # 3. Remove trailing equation numbers such as "(3)".
    eq_text = re.sub(r'\((\d+)\)$', '', eq_text).strip()

    # 4. Convert simple fractions to \frac. Fix: the old code searched with
    #    optional spaces around '/' but replaced the space-free string
    #    "a/b", so "a / b" silently stayed unconverted. re.sub handles both
    #    and converts every fraction, not just the first.
    eq_text = re.sub(r'([a-zA-Z0-9]+)\s*/\s*([a-zA-Z0-9]+)',
                     r'\\frac{\1}{\2}', eq_text)

    # 5. Add spaces around operators ('-' is skipped to keep negative signs).
    operators = ['+', '-', '=', '<', '>', '\\approx', '\\sim', '\\equiv']
    for op in operators:
        if op != '-':
            eq_text = eq_text.replace(op, f" {op} ")

    # Collapse runs of spaces introduced above.
    while '  ' in eq_text:
        eq_text = eq_text.replace('  ', ' ')

    return eq_text.strip()
||||
|
||||
def process_pdf_to_markdown(pdf_path, output_md_path, output_dir="output", lang="english"):
    """Process a PDF and generate a Markdown file with text, images, and LaTeX equations.

    Per page: extracts raw text, saves images via extract_images_from_page
    (with AI-generated descriptions), and extracts equations (Pix2Text when
    available, regex fallback otherwise). Writes everything to
    *output_md_path* and returns that path.
    """
    logger.info(f"Processing PDF: {pdf_path}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Open the PDF with PyMuPDF
    doc = fitz.open(pdf_path)
    logger.info(f"PDF opened successfully. Document has {len(doc)} pages")

    # Initialize Markdown content with a title derived from the filename
    md_content = []
    md_content.append(f"# {os.path.splitext(os.path.basename(pdf_path))[0]}\n")

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # Process each page for text, images, and equations
    for page_num, page in enumerate(doc, 1):
        logger.info(f"Processing page {page_num}/{len(doc)}")

        # Extract text
        text = page.get_text("text")
        logger.info(f"Extracted {len(text)} characters of text from page {page_num}")

        # Add text to Markdown
        md_content.append(f"## Page {page_num}\n")
        md_content.append(f"{text.strip()}\n")

        # Extract images using multiple methods to ensure they're not black
        image_paths = extract_images_from_page(page, output_dir, pdf_name, page_num)
        logger.info(f"Extracted {len(image_paths)} images from page {page_num}")

        # Process each extracted image
        for img_index, image_path in enumerate(image_paths, 1):
            try:
                # Generate image description
                logger.info(f"Generating description for image {img_index} on page {page_num}")
                description = get_image_description(image_path, language=lang)
                logger.info(f"Description generated: {description[:50]}..." if len(description) > 50 else f"Description generated: {description}")

                # Add image and description to Markdown.
                # NOTE(review): this line only appends blank lines and never
                # references image_path — presumably a Markdown image link
                # (![...](path)) was lost in transit; confirm against the
                # original commit.
                md_content.append(f"\n\n")
                md_content.append(f"**Description:** {description}\n")
            except Exception as e:
                logger.error(f"Error processing image {img_index} on page {page_num}: {str(e)}")

        # Extract and enhance equations - use Pix2Text if available
        logger.info(f"Extracting equations from page {page_num}")
        latex_equations = []

        if PIX2TEXT_AVAILABLE:
            # Render the page to an image for Pix2Text processing
            temp_pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                temp_pix.save(tmp.name)
                temp_path = tmp.name

            # Extract equations using Pix2Text
            latex_equations = extract_latex_with_pix2text(temp_path)

            # Clean up temp file
            os.unlink(temp_path)
        else:
            # Fallback to basic extraction from the page's text
            latex_equations = extract_latex_from_text(text)

        logger.info(f"Found {len(latex_equations)} potential equations on page {page_num}")

        for i, eq in enumerate(latex_equations, 1):
            try:
                logger.info(f"Equation {i}: {eq[:30]}..." if len(eq) > 30 else f"Equation {i}: {eq}")

                # Add equation to Markdown as a display-math block
                md_content.append(f"\n$$\n{eq}\n$$\n")
            except Exception as e:
                logger.error(f"Error formatting equation {i} on page {page_num}: {str(e)}")

    # Write content to Markdown file
    with open(output_md_path, "w", encoding="utf-8") as md_file:
        md_file.write("\n".join(md_content))

    logger.info(f"Markdown file generated: {output_md_path}")
    print(f"Conversion complete. Markdown file generated: {output_md_path}")

    return output_md_path
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line interface for the PDF-to-Markdown converter.
    cli = argparse.ArgumentParser(
        description="Convert PDF to Markdown with text, images, and LaTeX equations")
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument("--output_md", default="output.md",
                     help="Path to output Markdown file")
    cli.add_argument("--output_dir", default="output",
                     help="Directory for extracted images")
    cli.add_argument("--language", default="english", choices=["english", "french"],
                     help="Language for image descriptions (english or french)")
    cli.add_argument("--verbose", "-v", action="store_true",
                     help="Enable verbose logging (DEBUG level)")
    args = cli.parse_args()

    # Raise log verbosity when requested.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Run the conversion.
    process_pdf_to_markdown(args.pdf_path, args.output_md, args.output_dir, args.language)
||||
274
requirements.txt
274
requirements.txt
@@ -1,42 +1,232 @@
|
||||
# Core RAG and LLM libraries
|
||||
langchain>=0.0.267
|
||||
langchain-community>=0.0.10
|
||||
transformers>=4.30.0
|
||||
langchain_community
|
||||
|
||||
# Document processing
|
||||
unstructured>=0.10.0
|
||||
pdf2image>=1.16.3
|
||||
pypdf2>=3.0.0
|
||||
pdfminer.six>=20221105
|
||||
|
||||
# OCR and image processing
|
||||
pytesseract>=0.3.10
|
||||
Pillow>=9.5.0
|
||||
opencv-python>=4.8.0
|
||||
|
||||
# Table extraction
|
||||
camelot-py>=0.11.0
|
||||
tabula-py>=2.7.0
|
||||
|
||||
# Data manipulation
|
||||
pandas>=2.0.0
|
||||
numpy
|
||||
|
||||
# Visualization
|
||||
matplotlib>=3.7.0
|
||||
|
||||
# Optional but commonly used with RAG
|
||||
scikit-learn>=1.2.0
|
||||
sentence-transformers>=2.2.2
|
||||
|
||||
# Vector database connections (common choices, uncomment as needed)
|
||||
# chromadb>=0.4.6
|
||||
# pinecone-client>=2.2.2
|
||||
# qdrant-client>=1.3.0
|
||||
# faiss-cpu>=1.7.4
|
||||
|
||||
# Utilities
|
||||
tqdm>=4.65.0
|
||||
python-dotenv>=1.0.0
|
||||
pi_heif
|
||||
acres==0.3.0
|
||||
aiofiles==24.1.0
|
||||
aiohappyeyeballs==2.4.6
|
||||
aiohttp==3.11.13
|
||||
aiosignal==1.3.2
|
||||
annotated-types==0.7.0
|
||||
antlr4-python3-runtime==4.9.3
|
||||
anyio==4.8.0
|
||||
asttokens==3.0.0
|
||||
attrs==25.1.0
|
||||
backoff==2.2.1
|
||||
beautifulsoup4==4.13.3
|
||||
cachetools==5.5.2
|
||||
camelot-py==1.0.0
|
||||
certifi==2025.1.31
|
||||
cffi==1.17.1
|
||||
chardet==5.2.0
|
||||
charset-normalizer==3.4.1
|
||||
ci-info==0.3.0
|
||||
click==8.1.8
|
||||
coloredlogs==15.0.1
|
||||
comm==0.2.2
|
||||
configobj==5.0.9
|
||||
configparser==7.1.0
|
||||
contourpy==1.3.1
|
||||
cryptography==44.0.1
|
||||
cycler==0.12.1
|
||||
dataclasses-json==0.6.7
|
||||
debugpy==1.8.12
|
||||
decorator==5.2.1
|
||||
Deprecated==1.2.18
|
||||
distro==1.9.0
|
||||
effdet==0.4.1
|
||||
emoji==2.14.1
|
||||
et_xmlfile==2.0.0
|
||||
etelemetry==0.3.1
|
||||
eval_type_backport==0.2.2
|
||||
executing==2.2.0
|
||||
filelock==3.17.0
|
||||
filetype==1.2.0
|
||||
flatbuffers==25.2.10
|
||||
fonttools==4.56.0
|
||||
frontend==0.0.3
|
||||
frozenlist==1.5.0
|
||||
fsspec==2025.2.0
|
||||
google-api-core==2.24.1
|
||||
google-auth==2.38.0
|
||||
google-cloud-vision==3.10.0
|
||||
googleapis-common-protos==1.68.0
|
||||
greenlet==3.1.1
|
||||
grpcio==1.71.0rc2
|
||||
grpcio-status==1.71.0rc2
|
||||
grpcio-tools==1.70.0
|
||||
h11==0.14.0
|
||||
h2==4.2.0
|
||||
hpack==4.1.0
|
||||
html5lib==1.1
|
||||
httpcore==1.0.7
|
||||
httplib2==0.22.0
|
||||
httpx==0.28.1
|
||||
httpx-sse==0.4.0
|
||||
huggingface-hub==0.29.1
|
||||
humanfriendly==10.0
|
||||
hyperframe==6.1.0
|
||||
idna==3.10
|
||||
ipykernel==6.29.5
|
||||
ipython==9.0.0
|
||||
ipython_pygments_lexers==1.1.1
|
||||
ipywidgets==8.1.5
|
||||
isodate==0.6.1
|
||||
itsdangerous==2.2.0
|
||||
jedi==0.19.2
|
||||
Jinja2==3.1.5
|
||||
jiter==0.8.2
|
||||
joblib==1.4.2
|
||||
jsonpatch==1.33
|
||||
jsonpointer==3.0.0
|
||||
jupyter_client==8.6.3
|
||||
jupyter_core==5.7.2
|
||||
jupyterlab_widgets==3.0.13
|
||||
kiwisolver==1.4.8
|
||||
langchain==0.3.19
|
||||
langchain-community==0.3.18
|
||||
langchain-core==0.3.40
|
||||
langchain-deepseek==0.1.2
|
||||
langchain-ollama==0.2.3
|
||||
langchain-openai==0.3.7
|
||||
langchain-qdrant==0.2.0
|
||||
langchain-text-splitters==0.3.6
|
||||
langdetect==1.0.9
|
||||
langsmith==0.3.11
|
||||
looseversion==1.3.0
|
||||
lxml==5.3.1
|
||||
Markdown==3.7
|
||||
MarkupSafe==3.0.2
|
||||
marshmallow==3.26.1
|
||||
matplotlib==3.10.1
|
||||
matplotlib-inline==0.1.7
|
||||
mpmath==1.3.0
|
||||
multidict==6.1.0
|
||||
mypy-extensions==1.0.0
|
||||
nest-asyncio==1.6.0
|
||||
networkx==3.4.2
|
||||
nibabel==5.3.2
|
||||
nipype==1.9.2
|
||||
nltk==3.9.1
|
||||
numpy==1.26.4
|
||||
nvidia-cublas-cu12==12.4.5.8
|
||||
nvidia-cuda-cupti-cu12==12.4.127
|
||||
nvidia-cuda-nvrtc-cu12==12.4.127
|
||||
nvidia-cuda-runtime-cu12==12.4.127
|
||||
nvidia-cudnn-cu12==9.1.0.70
|
||||
nvidia-cufft-cu12==11.2.1.3
|
||||
nvidia-curand-cu12==10.3.5.147
|
||||
nvidia-cusolver-cu12==11.6.1.9
|
||||
nvidia-cusparse-cu12==12.3.1.170
|
||||
nvidia-cusparselt-cu12==0.6.2
|
||||
nvidia-nccl-cu12==2.21.5
|
||||
nvidia-nvjitlink-cu12==12.4.127
|
||||
nvidia-nvtx-cu12==12.4.127
|
||||
olefile==0.47
|
||||
ollama==0.4.7
|
||||
omegaconf==2.3.0
|
||||
onnx==1.17.0
|
||||
onnxruntime==1.20.1
|
||||
openai==1.65.2
|
||||
opencv-python==4.11.0.86
|
||||
opencv-python-headless==4.11.0.86
|
||||
openpyxl==3.1.5
|
||||
orjson==3.10.15
|
||||
packaging==24.2
|
||||
pandas==2.2.3
|
||||
parso==0.8.4
|
||||
pathlib==1.0.1
|
||||
pdf2image==1.17.0
|
||||
pdfminer.six==20240706
|
||||
pexpect==4.9.0
|
||||
pi_heif==0.21.0
|
||||
pikepdf==9.5.2
|
||||
pillow==11.1.0
|
||||
platformdirs==4.3.6
|
||||
portalocker==2.10.1
|
||||
prompt_toolkit==3.0.50
|
||||
propcache==0.3.0
|
||||
proto-plus==1.26.0
|
||||
protobuf==5.29.3
|
||||
prov==2.0.1
|
||||
psutil==7.0.0
|
||||
ptyprocess==0.7.0
|
||||
pure_eval==0.2.3
|
||||
puremagic==1.28
|
||||
pyasn1==0.6.1
|
||||
pyasn1_modules==0.4.1
|
||||
pycocotools==2.0.8
|
||||
pycparser==2.22
|
||||
pydantic==2.10.6
|
||||
pydantic-settings==2.8.1
|
||||
pydantic_core==2.27.2
|
||||
pydot==3.0.4
|
||||
Pygments==2.19.1
|
||||
PyMuPDF==1.25.3
|
||||
pymupdf4llm==0.0.17
|
||||
pypandoc==1.15
|
||||
pyparsing==3.2.1
|
||||
pypdf==5.3.0
|
||||
PyPDF2==3.0.1
|
||||
pypdfium2==4.30.1
|
||||
pytesseract==0.3.13
|
||||
python-dateutil==2.9.0.post0
|
||||
python-docx==1.1.2
|
||||
python-dotenv==1.0.1
|
||||
python-iso639==2025.2.18
|
||||
python-magic==0.4.27
|
||||
python-multipart==0.0.20
|
||||
python-oxmsg==0.0.2
|
||||
python-pptx==1.0.2
|
||||
pytz==2025.1
|
||||
pyxnat==1.6.3
|
||||
PyYAML==6.0.2
|
||||
pyzmq==26.2.1
|
||||
qdrant-client==1.13.2
|
||||
RapidFuzz==3.12.1
|
||||
rdflib==6.3.2
|
||||
regex==2024.11.6
|
||||
requests==2.32.3
|
||||
requests-toolbelt==1.0.0
|
||||
rsa==4.9
|
||||
safetensors==0.5.3
|
||||
scikit-learn==1.6.1
|
||||
scipy==1.15.2
|
||||
sentence-transformers==3.4.1
|
||||
setuptools==75.8.2
|
||||
simplejson==3.20.1
|
||||
six==1.17.0
|
||||
sniffio==1.3.1
|
||||
soupsieve==2.6
|
||||
SQLAlchemy==2.0.38
|
||||
stack-data==0.6.3
|
||||
starlette==0.46.0
|
||||
sympy==1.13.1
|
||||
tabula-py==2.10.0
|
||||
tabulate==0.9.0
|
||||
tenacity==9.0.0
|
||||
threadpoolctl==3.5.0
|
||||
tiktoken==0.9.0
|
||||
timm==1.0.15
|
||||
tokenizers==0.21.0
|
||||
torch==2.6.0
|
||||
torchvision==0.21.0
|
||||
tornado==6.4.2
|
||||
tqdm==4.67.1
|
||||
traitlets==5.14.3
|
||||
traits==7.0.2
|
||||
transformers==4.49.0
|
||||
triton==3.2.0
|
||||
typing-inspect==0.9.0
|
||||
typing_extensions==4.12.2
|
||||
tzdata==2025.1
|
||||
unstructured==0.16.23
|
||||
unstructured-client==0.30.6
|
||||
unstructured-inference==0.8.7
|
||||
unstructured.pytesseract==0.3.13
|
||||
urllib3==2.3.0
|
||||
uvicorn==0.34.0
|
||||
wcwidth==0.2.13
|
||||
webencodings==0.5.1
|
||||
widgetsnbextension==4.0.13
|
||||
wrapt==1.17.2
|
||||
xlrd==2.0.1
|
||||
XlsxWriter==3.2.2
|
||||
yarl==1.18.3
|
||||
zstandard==0.23.0
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user