#!/usr/bin/env python3
"""
Cleanup script for duplicate file entries in the migration database.
Removes duplicates caused by path normalization issues.
"""

import os
from typing import List, Dict, Tuple
from db_utils import MigrationDB

def find_duplicate_files() -> List[Tuple[str, List[Dict]]]:
    """Find files that appear multiple times with different path formats.

    Every record returned by ``MigrationDB.get_all_files()`` is reduced to a
    normalized ``primary/sub/name`` key; records sharing a key are reported
    together.

    Returns:
        A list of ``(normalized_path, records)`` tuples, one per key that is
        shared by more than one record, in first-seen order.
    """
    db = MigrationDB()

    # Group files by normalized path
    path_groups: Dict[str, List[Dict]] = {}

    for file_record in db.get_all_files():
        primary = file_record['primary_folder'] or ''
        sub = file_record['sub_folder'] or ''
        name = file_record['file_name']

        # Drop every empty segment instead of doing a single
        # ``replace('//', '/')`` pass: one pass leaves "a///x" as "a//x",
        # so e.g. an empty sub folder combined with a name that starts with
        # "/" would never be grouped with its clean twin "a/x".
        raw_path = f"{primary}/{sub}/{name}"
        normalized_key = '/'.join(
            segment for segment in raw_path.split('/') if segment
        )

        path_groups.setdefault(normalized_key, []).append(file_record)

    # Only keys backed by more than one record are duplicates.
    return [
        (normalized_path, files)
        for normalized_path, files in path_groups.items()
        if len(files) > 1
    ]

def find_server_path_duplicates() -> List[Tuple[str, List[Dict]]]:
    """Report server-path records whose relative tail matches an earlier path.

    Rows are read from the ``files`` table ordered by primary folder, sub
    folder and file name. For each row whose path contains one of the server
    prefixes, the text after the prefix is compared against every path seen
    so far in that ordering.

    Returns:
        A list of ``(relative_part, records)`` tuples. ``records`` starts
        with the server-path record, followed by the records stored under
        the first previously seen path that contains ``relative_part``.
    """
    # Checked in this order; the first marker found in a path is the one used.
    server_markers = ('var/www/vmserver10/intranet/', '/vmserver10/intranet/')

    db = MigrationDB()

    with db.get_connection() as conn:
        rows = conn.execute(
            'SELECT * FROM files ORDER BY primary_folder, sub_folder, file_name'
        ).fetchall()
        all_files = [dict(row) for row in rows]

    seen_paths = {}
    duplicates = []

    for file_record in all_files:
        path = db.get_file_path(file_record)

        marker = next((m for m in server_markers if m in path), None)
        if marker is not None:
            # Everything after the last occurrence of the server prefix.
            rel_part = path.split(marker)[-1]

            # A suffix match is just a special case of containment, so a
            # plain substring test covers both checks.
            # NOTE(review): substring matching is loose (e.g. "a.php" also
            # matches "data.php") - confirm this is the intended heuristic.
            for earlier_path, earlier_records in seen_paths.items():
                if rel_part in earlier_path:
                    duplicates.append((rel_part, [file_record] + earlier_records))
                    break

        # Remember this path for the records that follow.
        seen_paths.setdefault(path, []).append(file_record)

    return duplicates

def remove_server_path_duplicates(dry_run: bool = True) -> int:
    """Delete server-path records that also exist under their relative path.

    Candidate rows are those whose primary folder, sub folder or file name
    contains ``var/www`` or ``vmserver10``. For each candidate the part of
    its path after the server prefix is split into primary folder, sub
    folder and file name, and looked up with ``db.get_file_by_path``. When
    that lookup finds a record, the server-path row is deleted by id.

    Args:
        dry_run: When true, only print what would be removed.

    Returns:
        The number of records removed, or that would be removed in a dry run.
    """
    # Checked in this order; the first marker found in a path is the one used.
    server_markers = ('var/www/vmserver10/intranet/', '/vmserver10/intranet/')

    db = MigrationDB()

    print("🔍 Looking for server path duplicates...")

    # Get all files with server paths
    with db.get_connection() as conn:
        cursor = conn.execute('''
            SELECT * FROM files 
            WHERE primary_folder LIKE '%var/www%' 
               OR primary_folder LIKE '%vmserver10%'
               OR sub_folder LIKE '%var/www%'
               OR sub_folder LIKE '%vmserver10%'
               OR file_name LIKE '%var/www%'
               OR file_name LIKE '%vmserver10%'
        ''')
        server_path_files = [dict(row) for row in cursor.fetchall()]

    removed_count = 0

    for file_record in server_path_files:
        path = db.get_file_path(file_record)
        print(f"Found server path: {path}")

        # Keep only what follows the last occurrence of the server prefix.
        marker = next((m for m in server_markers if m in path), None)
        rel_path = path.split(marker)[-1] if marker is not None else None
        if not rel_path:
            continue

        # Map the relative path onto (primary folder, sub folder, file name);
        # anything beyond the second segment stays part of the file name.
        segments = rel_path.split('/')
        if len(segments) >= 3:
            target_primary, target_sub = segments[0], segments[1]
            target_name = '/'.join(segments[2:])
        elif len(segments) == 2:
            target_primary, target_name = segments
            target_sub = ''
        else:
            target_primary = target_sub = ''
            target_name = segments[0]

        relative_file = db.get_file_by_path(target_primary, target_sub, target_name)
        if not relative_file:
            continue

        print(f"  → Found relative version: {db.get_file_path(relative_file)}")

        if dry_run:
            print("  → Would remove server path duplicate")
        else:
            # Remove the server path version, keeping the relative one.
            with db.get_connection() as conn:
                conn.execute('DELETE FROM files WHERE id = ?', (file_record['id'],))
            print("  → Removed server path duplicate")
        removed_count += 1

    return removed_count

def normalize_all_paths(dry_run: bool = True) -> int:
    """Strip server path prefixes from every stored path component.

    Each record's primary folder, sub folder and file name are checked for
    a server prefix. A component that contains one is cut down to the text
    after the last occurrence of that prefix, and the record is updated
    unless this is a dry run.

    Args:
        dry_run: When true, only print the planned changes.

    Returns:
        The number of records updated, or that would be updated in a dry run.
    """
    # Checked in this order; the first marker found in a value is the one used.
    server_markers = ('var/www/vmserver10/intranet/', '/vmserver10/intranet/')

    def strip_server_prefix(value):
        """Return ``(cleaned_value, changed)`` for one path component."""
        for marker in server_markers:
            if marker in value:
                return value.split(marker)[-1], True
        return value, False

    db = MigrationDB()
    updated_count = 0

    print("🔧 Normalizing all file paths...")

    all_files = db.get_all_files()

    for file_record in all_files:
        original_path = db.get_file_path(file_record)

        primary, primary_changed = strip_server_prefix(file_record['primary_folder'] or '')
        sub, sub_changed = strip_server_prefix(file_record['sub_folder'] or '')
        name, name_changed = strip_server_prefix(file_record['file_name'])

        if primary_changed or sub_changed or name_changed:
            new_path = f"{primary}/{sub}/{name}".replace('//', '/').strip('/')
            print(f"Normalizing: {original_path} → {new_path}")

            if not dry_run:
                # last_updated is written back with the value it already had.
                with db.get_connection() as conn:
                    conn.execute('''
                        UPDATE files 
                        SET primary_folder = ?, sub_folder = ?, file_name = ?, last_updated = ?
                        WHERE id = ?
                    ''', (primary, sub, name, file_record['last_updated'], file_record['id']))

            # Counted in both modes so a dry run reports the planned total.
            updated_count += 1

    return updated_count

def main():
    """Parse command-line flags and run the requested cleanup actions.

    Actions (``--find-duplicates``, ``--remove-server-duplicates``,
    ``--normalize-paths``) may be combined and run in that order. If none is
    given, a short usage hint is printed instead.

    ``--dry-run`` is a ``store_true`` flag that also defaults to true, so it
    is always set; in practice changes are made only when ``--execute`` is
    passed.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Clean up duplicate and malformed file paths')
    parser.add_argument('--find-duplicates', action='store_true',
                       help='Find and list duplicate files')
    parser.add_argument('--remove-server-duplicates', action='store_true',
                       help='Remove duplicate entries with server paths')
    parser.add_argument('--normalize-paths', action='store_true',
                       help='Normalize all paths to remove server prefixes')
    parser.add_argument('--dry-run', action='store_true', default=True,
                       help='Show what would be done without making changes (default)')
    parser.add_argument('--execute', action='store_true',
                       help='Actually perform the cleanup (overrides --dry-run)')

    args = parser.parse_args()

    print("🧹 PHP Migration Database Cleanup")
    print("=" * 40)

    # Override dry_run if execute is specified
    dry_run = args.dry_run and not args.execute

    if dry_run:
        print("🔍 DRY RUN MODE - No changes will be made")
        print("Use --execute to actually perform cleanup")
        print()

    if args.find_duplicates:
        duplicates = find_duplicate_files()
        if duplicates:
            # One database handle is enough to format every path; creating
            # a new MigrationDB per printed record was loop-invariant work.
            db = MigrationDB()
            print(f"Found {len(duplicates)} sets of duplicate files:")
            for normalized_path, files in duplicates:
                print(f"\n📁 {normalized_path} ({len(files)} entries):")
                for file_record in files:
                    path = db.get_file_path(file_record)
                    print(f"   - ID {file_record['id']}: {path}")
        else:
            print("✅ No duplicate files found")

    if args.remove_server_duplicates:
        removed = remove_server_path_duplicates(dry_run)
        if dry_run:
            print(f"Would remove {removed} server path duplicates")
        else:
            print(f"✅ Removed {removed} server path duplicates")

    if args.normalize_paths:
        updated = normalize_all_paths(dry_run)
        if dry_run:
            print(f"Would normalize {updated} file paths")
        else:
            print(f"✅ Normalized {updated} file paths")

    if not any([args.find_duplicates, args.remove_server_duplicates, args.normalize_paths]):
        print("Please specify an action:")
        print("  --find-duplicates        Find duplicate file entries")
        print("  --remove-server-duplicates  Remove server path duplicates")
        print("  --normalize-paths        Normalize all paths")
        print("  --execute                Actually perform changes")

# Run the cleanup CLI only when this file is executed as a script.
if __name__ == '__main__':
    main()