#!/usr/bin/env python3
"""
Enhanced link scanner for PHP migration tracking system.
Scans PHP files for includes/requires and populates the database links table.
"""

import os
import re
import sys
from pathlib import Path
from typing import List, Dict, Set, Optional
from db_utils import MigrationDB

class LinkScanner:
    """Scanner for PHP file dependencies.

    Reads PHP sources, extracts references to other ``.php`` files
    (includes/requires as well as HTML/JavaScript links), normalizes them to
    ``/intranet/...`` paths and stores them through ``MigrationDB``.
    """

    # Prefix carried by every normalized link path.
    INTRANET_PREFIX = '/intranet/'

    # Absolute prefixes that are mapped directly below /intranet without
    # looking for an embedded 'intranet/' segment.
    KNOWN_ROOT_PREFIXES = (
        '/databases/',
        '/operainfo/',
        '/functions/',
        '/includes/',
        '/images/',
    )

    # Specific malformed links found during scanning, mapped to their fix.
    MALFORMED_LINK_FIXES = {
        '/$baseurl/intranet/databases/crm/HiddenRemovedContacts.php':
            '/intranet/databases/crm/HiddenRemovedContacts.php',
        '/<?php print "$baseurl";?>/intranet/databases/crm/crmlinkeddocuments.php':
            '/intranet/databases/crm/crmlinkeddocuments.php',
    }

    # Patterns for finding PHP includes/requires and references.  Each pattern
    # has exactly one capture group: the referenced path.
    DEFAULT_INCLUDE_PATTERNS = (
        # Standard PHP includes/requires
        r'include\s*\(\s*[\'"]([^\'"]*\.php[^\'"]*)[\'"]\s*\)',
        r'include_once\s*\(\s*[\'"]([^\'"]*\.php[^\'"]*)[\'"]\s*\)',
        r'require\s*\(\s*[\'"]([^\'"]*\.php[^\'"]*)[\'"]\s*\)',
        r'require_once\s*\(\s*[\'"]([^\'"]*\.php[^\'"]*)[\'"]\s*\)',

        # General pattern for PHP files in quotes (no closing quote is
        # required, so a path followed by an escaped quote still matches)
        r'[\"\']((?:\.\./|/intranet/|[a-zA-Z_])[^\"\']*\.php)',

        # JavaScript variable assignments with PHP files
        r'link\s*=\s*[\"\'](.*?\.php)',

        # XMLHttpRequest.open calls
        r'xmlhttp\.open\s*\([^,]+,\s*[\"\'](.*?\.php)',

        # Window.location assignments
        r'window\.location\s*[.=]\s*[\"\'](.*?\.php)',
        r'window\.open\s*\(\s*[\"\'](.*?\.php)',

        # Form actions
        r'action\s*=\s*[\"\'](.*?\.php)',

        # Href attributes
        r'href\s*=\s*[\"\'](.*?\.php)',
    )

    def __init__(self, base_path: str = '../vmserver10/intranet'):
        """
        Create a scanner rooted at ``base_path``.

        Args:
            base_path: Directory that corresponds to ``/intranet`` on the
                server; it is resolved to an absolute path.
        """
        self.base_path = Path(base_path).resolve()
        self.db = MigrationDB()

        # Per-instance copy so callers may extend the list without affecting
        # other scanners.
        self.include_patterns = list(self.DEFAULT_INCLUDE_PATTERNS)

    @staticmethod
    def _split_intranet_path(target_path: str):
        """
        Split a ``/intranet/...`` path into (primary, sub, name) components.

        Args:
            target_path: Normalized link path.

        Returns:
            A 3-tuple ``(primary_folder, sub_folder, file_name)`` where missing
            folders are empty strings and anything below the second folder
            stays inside ``file_name``; ``None`` when the path does not start
            with ``/intranet/``.
        """
        prefix = LinkScanner.INTRANET_PREFIX
        if not target_path.startswith(prefix):
            return None

        parts = target_path[len(prefix):].split('/')
        if len(parts) == 1:
            return '', '', parts[0]
        if len(parts) == 2:
            return parts[0], '', parts[1]
        return parts[0], parts[1], '/'.join(parts[2:])

    def _relative_to_base(self, fs_path: str) -> Optional[str]:
        """
        Express a filesystem path relative to ``self.base_path``.

        Returns:
            The relative path using ``/`` separators (``'.'`` for the base
            directory itself), or ``None`` when the path lies outside the base
            directory or cannot be related to it.
        """
        try:
            rel = os.path.relpath(fs_path, self.base_path)
        except ValueError:
            # Raised e.g. for an empty path or paths on different drives.
            return None

        # Only a real parent-directory step means "outside"; a name that
        # merely begins with two dots (e.g. '..cache') is still inside.
        if rel == os.pardir or rel.startswith(os.pardir + os.sep):
            return None
        return rel.replace(os.sep, '/')

    def _normalize_absolute(self, include_path: str) -> str:
        """Map a path starting with ``/`` onto the ``/intranet/`` tree."""
        if include_path.startswith(self.INTRANET_PREFIX):
            return include_path

        if (not include_path.startswith(self.KNOWN_ROOT_PREFIXES)
                and 'intranet/' in include_path):
            # Server-style path: keep everything after the last 'intranet/'.
            tail = include_path.rsplit('intranet/', 1)[-1]
            return f"/intranet/{tail}"

        # Known root folder, or any other absolute path: hang it below /intranet.
        return f"/intranet{include_path}"

    def normalize_include_path(self, include_path: str, source_file_path: str) -> Optional[str]:
        """
        Normalize an include path to a standard format.

        Args:
            include_path: Raw include path from PHP file
            source_file_path: Path of the file containing the include

        Returns:
            Normalized ``/intranet/...`` path, or None if the path is empty or
            resolves outside the base directory
        """
        # Clean up the path
        include_path = include_path.strip('\'"')
        if not include_path:
            return None

        # Handle specific malformed links found during scanning
        fixed = self.MALFORMED_LINK_FIXES.get(include_path)
        if fixed is not None:
            return fixed

        # Handle absolute server paths like /var/www/vmserver10/intranet/
        server_marker = '/vmserver10/intranet/'
        if server_marker in include_path:
            tail = include_path.rsplit(server_marker, 1)[-1]
            return f"/intranet/{tail}"

        # Handle absolute paths starting with /
        if include_path.startswith('/'):
            return self._normalize_absolute(include_path)

        source_dir = os.path.dirname(source_file_path)

        # Handle relative paths: resolve against the source file's directory
        if include_path.startswith(('../', './')):
            resolved_path = os.path.normpath(os.path.join(source_dir, include_path))
            rel_to_intranet = self._relative_to_base(resolved_path)
            if rel_to_intranet is None:
                return None
            return f"/intranet/{rel_to_intranet}"

        # Relative path without ./ or ../ is taken as relative to /intranet
        if '/' in include_path:
            return f"/intranet/{include_path}"

        # Bare filename - use same directory as source file
        rel_source_dir = self._relative_to_base(source_dir)
        if rel_source_dir is None:
            return None
        if rel_source_dir == '.':
            # Source file sits directly in base_path; avoid '/intranet/./name'.
            return f"/intranet/{include_path}"
        return f"/intranet/{rel_source_dir}/{include_path}"

    def scan_file_content(self, file_path: str) -> List[str]:
        """
        Scan a PHP file for include/require statements and other PHP links.

        Args:
            file_path: Path to PHP file to scan

        Returns:
            Sorted list of unique normalized include paths; empty when the
            file does not exist or cannot be read
        """
        if not os.path.exists(file_path):
            return []

        try:
            # Undecodable bytes are dropped rather than failing the scan.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            print(f"Error reading {file_path}: {e}")
            return []

        found_includes: Set[str] = set()

        # Search for include patterns
        for pattern in self.include_patterns:
            for match in re.findall(pattern, content, re.IGNORECASE):
                normalized = self.normalize_include_path(match, file_path)
                if normalized:
                    found_includes.add(normalized)

        # Sorted so repeated scans yield the same order.
        return sorted(found_includes)

    def _register_target_if_new(self, include_path: str, file_record) -> bool:
        """
        Add the link target to the files table when it is not there yet.

        Args:
            include_path: Normalized link path.
            file_record: Database record of the file that contains the link.

        Returns:
            True when a new file row was added, False otherwise.
        """
        components = self._split_intranet_path(include_path)
        if components is None:
            return False
        primary, sub, name = components

        # Check if file exists in database
        if self.db.get_file_by_path(primary, sub, name):
            return False

        # Check if file exists on filesystem
        target_file_path = self.db.get_full_filesystem_path({
            'primary_folder': primary,
            'sub_folder': sub,
            'file_name': name
        }, str(self.base_path))
        file_exists = os.path.exists(target_file_path)

        # Add to database.  NOTE(review): the positional flags follow
        # MigrationDB.add_file's signature, which is not visible here.
        self.db.add_file(primary, sub, name, 'new', 0, 0, 0, int(file_exists), 0, 0, 0,
                        f'Auto-discovered from {self.db.get_file_path(file_record)}')
        print(f"  + Added new file: {include_path} (exists: {file_exists})")
        return True

    def scan_file_by_id(self, file_id: int) -> int:
        """
        Scan a file by database ID and update links.

        Args:
            file_id: Database ID of file to scan

        Returns:
            Number of links found (0 when the ID is unknown)
        """
        # Get file record
        file_record = self.db.get_file_by_id(file_id)
        if not file_record:
            print(f"File ID {file_id} not found in database")
            return 0

        # Construct file path and scan for includes
        file_path = self.db.get_full_filesystem_path(file_record, str(self.base_path))
        includes = self.scan_file_content(file_path)

        # Clear existing links before storing the fresh set
        self.db.clear_links_for_file(file_id)

        # Add new links and register targets missing from the database
        new_files_added = 0
        for include_path in includes:
            self.db.add_link(file_id, include_path)
            if self._register_target_if_new(include_path, file_record):
                new_files_added += 1

        # Mark file as scanned
        self.db.update_file_flags(file_id, links_scanned=1)

        if new_files_added > 0:
            print(f"  Added {new_files_added} new files to database")

        return len(includes)

    def scan_file_by_path(self, primary_folder: str, sub_folder: str, file_name: str) -> int:
        """
        Scan a file by path components and update links.

        Args:
            primary_folder: Primary folder name
            sub_folder: Sub folder name
            file_name: File name

        Returns:
            Number of links found (0 when the file is not in the database)
        """
        file_record = self.db.get_file_by_path(primary_folder, sub_folder, file_name)
        if not file_record:
            print(f"File not found in database: {primary_folder}/{sub_folder}/{file_name}")
            return 0

        return self.scan_file_by_id(file_record['id'])

    def scan_all_unscanned(self) -> Dict[str, int]:
        """
        Scan all files that haven't been scanned for links yet.

        Returns:
            Dictionary with scan statistics: 'files_scanned', 'total_links'
            and 'errors'
        """
        unscanned_files = self.db.get_files_needing_links_scan()
        total = len(unscanned_files)

        stats = {
            'files_scanned': 0,
            'total_links': 0,
            'errors': 0
        }

        print(f"📡 Scanning {total} files for links...")

        for position, file_record in enumerate(unscanned_files, start=1):
            file_path = self.db.get_file_path(file_record)

            try:
                links_found = self.scan_file_by_id(file_record['id'])
                stats['files_scanned'] += 1
                stats['total_links'] += links_found

                # Progress line every 10 successfully scanned positions
                if position % 10 == 0:
                    print(f"   Processed {position}/{total} files...")

            except Exception as e:
                # Batch boundary: one failing file must not stop the run.
                print(f"Error scanning {file_path}: {e}")
                stats['errors'] += 1

        return stats

    def scan_specific_files(self, file_ids: List[int]) -> Dict[str, int]:
        """
        Scan specific files by their database IDs.

        Args:
            file_ids: List of file IDs to scan

        Returns:
            Dictionary with scan statistics: 'files_scanned', 'total_links'
            and 'errors'
        """
        stats = {
            'files_scanned': 0,
            'total_links': 0,
            'errors': 0
        }

        print(f"📡 Scanning {len(file_ids)} specific files for links...")

        for file_id in file_ids:
            try:
                links_found = self.scan_file_by_id(file_id)
                stats['files_scanned'] += 1
                stats['total_links'] += links_found
            except Exception as e:
                # Batch boundary: one failing file must not stop the run.
                print(f"Error scanning file ID {file_id}: {e}")
                stats['errors'] += 1

        return stats

    def get_dependency_report(self, file_id: int) -> Dict:
        """
        Get dependency report for a specific file.

        Args:
            file_id: Database ID of file

        Returns:
            Dictionary with keys 'file_path', 'total_dependencies',
            'dependencies', 'missing_files' and 'external_dependencies';
            an empty dictionary when the file ID is unknown
        """
        file_record = self.db.get_file_by_id(file_id)
        if not file_record:
            return {}

        links = self.db.get_links_for_file(file_id)

        # Categorize dependencies
        dependencies = {
            'file_path': self.db.get_file_path(file_record),
            'total_dependencies': len(links),
            'dependencies': [],
            'missing_files': [],
            'external_dependencies': []
        }

        for link in links:
            target_path = link['target_path']
            components = self._split_intranet_path(target_path)

            # Anything outside /intranet/ is reported as external
            if components is None:
                dependencies['external_dependencies'].append(target_path)
                continue

            # Check if target exists in database
            target_file = self.db.get_file_by_path(*components)
            if not target_file:
                dependencies['missing_files'].append(target_path)
                continue

            dependencies['dependencies'].append({
                'path': target_path,
                'status': target_file['status'],
                'php8_ready': bool(target_file['php8_rewritten']),
                'tested': bool(target_file['tested'])
            })

        return dependencies

def _print_dependency_report(report):
    """Print the dependency report dictionary in a truncated, readable form."""
    print(f"\n📋 Dependency Report for {report['file_path']}:")
    print(f"   Total dependencies: {report['total_dependencies']}")

    internal = report['dependencies']
    if internal:
        print(f"   Internal dependencies ({len(internal)}):")
        for dep in internal[:10]:  # Show first 10
            status_icon = "✅" if dep['php8_ready'] and dep['tested'] else "⚠️"
            print(f"     {status_icon} {dep['path']} ({dep['status']})")
        if len(internal) > 10:
            print(f"     ... and {len(internal) - 10} more")

    missing_files = report['missing_files']
    if missing_files:
        print(f"   Missing files ({len(missing_files)}):")
        for missing in missing_files[:5]:
            print(f"     ❌ {missing}")
        if len(missing_files) > 5:
            print(f"     ... and {len(missing_files) - 5} more")

    external = report['external_dependencies']
    if external:
        print(f"   External dependencies ({len(external)}):")
        for ext in external[:5]:
            print(f"     🔗 {ext}")


def main():
    """
    Command-line entry point for the link scanner.

    Parses the arguments, runs the first requested action (report, single
    file by ID, single file by path, all unscanned, or full rescan) and,
    except after a report, prints overall database statistics.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Scan PHP files for dependencies')
    parser.add_argument('--file-id', type=int, help='Scan specific file by database ID')
    parser.add_argument('--file-path', help='Scan specific file by path (format: primary/sub/file.php)')
    parser.add_argument('--all-unscanned', action='store_true', help='Scan all unscanned files')
    parser.add_argument('--rescan-all', action='store_true', help='Rescan all files (clear existing links)')
    parser.add_argument('--base-path', default='../vmserver10/intranet', help='Base path for PHP files')
    parser.add_argument('--report', type=int, help='Generate dependency report for file ID')
    # NOTE: --batch-size is accepted for compatibility but not used below.
    parser.add_argument('--batch-size', type=int, default=50, help='Batch size for processing')

    args = parser.parse_args()

    print("🔗 PHP Migration Link Scanner")
    print("=" * 40)

    scanner = LinkScanner(args.base_path)

    # Compare against None (not truthiness) so that an ID of 0 is honoured.
    if args.report is not None:
        # Generate dependency report
        report = scanner.get_dependency_report(args.report)
        if report:
            _print_dependency_report(report)
        else:
            print(f"File ID {args.report} not found")
        return

    if args.file_id is not None:
        # Scan specific file by ID
        links_found = scanner.scan_file_by_id(args.file_id)
        print(f"✅ Scanned file ID {args.file_id}: found {links_found} links")

    elif args.file_path:
        # Scan specific file by path: primary/sub/name, where missing folders
        # become '' and deeper levels stay part of the file name
        parts = args.file_path.split('/')
        if len(parts) == 1:
            primary, sub, name = '', '', parts[0]
        elif len(parts) == 2:
            primary, sub, name = parts[0], '', parts[1]
        else:
            primary, sub, name = parts[0], parts[1], '/'.join(parts[2:])

        links_found = scanner.scan_file_by_path(primary, sub, name)
        print(f"✅ Scanned {args.file_path}: found {links_found} links")

    elif args.all_unscanned:
        # Scan all unscanned files
        stats = scanner.scan_all_unscanned()
        print("✅ Scan complete:")
        print(f"   Files scanned: {stats['files_scanned']}")
        print(f"   Total links found: {stats['total_links']}")
        print(f"   Errors: {stats['errors']}")

    elif args.rescan_all:
        # Rescan all files: reset the links_scanned flag everywhere so the
        # regular "unscanned" pass picks every file up again
        for file_row in scanner.db.get_all_files():
            scanner.db.update_file_flags(file_row['id'], links_scanned=0)

        stats = scanner.scan_all_unscanned()
        print("✅ Rescan complete:")
        print(f"   Files rescanned: {stats['files_scanned']}")
        print(f"   Total links found: {stats['total_links']}")
        print(f"   Errors: {stats['errors']}")

    else:
        print("Please specify an action:")
        print("  --file-id ID          Scan specific file")
        print("  --file-path PATH      Scan file by path")
        print("  --all-unscanned       Scan all unscanned files")
        print("  --rescan-all          Rescan all files")
        print("  --report ID           Generate dependency report")

    # Show final statistics
    db_stats = scanner.db.get_statistics()
    print("\n📊 Database Statistics:")
    print(f"   Total files: {db_stats['total_files']}")
    print(f"   Links scanned: {db_stats['links_scanned']}")
    print(f"   Total links: {db_stats['total_links']}")

# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()