import csv
import errno
import logging
import os
import re
import time
from collections import OrderedDict, defaultdict

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Characters replaced with "_" when a column value is turned into a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')


def sanitize_filename(filename: str, max_length: int = 255) -> str:
    """Turn an arbitrary column value into a file-name stem.

    Each of the characters ``< > : " / \\ | ? *`` is replaced with ``_``,
    leading/trailing dots and spaces are stripped, and the result is
    truncated to ``max_length - 4`` characters so that a four-character
    extension such as ``.csv`` can still be appended.

    Args:
        filename: Raw value to convert.
        max_length: Maximum length of the final name including extension.

    Returns:
        The sanitized stem (possibly empty).
    """
    filename = _UNSAFE_FILENAME_CHARS.sub('_', filename)
    filename = filename.strip('. ')
    return filename[:max_length - 4]


class _OpenOutput:
    """An open output file, its CSV writer and the time it was last written."""

    __slots__ = ('handle', 'writer', 'last_used')

    def __init__(self, handle, writer, last_used):
        self.handle = handle
        self.writer = writer
        self.last_used = last_used


def _close_oldest(open_files):
    """Close and forget the least recently used output file."""
    _, entry = open_files.popitem(last=False)
    entry.handle.close()


def _close_idle(open_files, now, file_idle_time):
    """Close every output file not written for more than *file_idle_time* seconds."""
    # open_files is kept ordered oldest-first, so we can stop at the first
    # entry that is still fresh.
    while open_files:
        oldest_key = next(iter(open_files))
        if now - open_files[oldest_key].last_used <= file_idle_time:
            break
        _close_oldest(open_files)


def _open_output(path, header, open_files, now):
    """Open *path* for appending, writing *header* first if the file is empty."""
    while True:
        try:
            handle = open(path, 'a', newline='', encoding='utf-8')
            break
        except OSError as exc:
            # If the OS descriptor limit is lower than max_open_files, free
            # one of our own handles and retry instead of aborting the run.
            if exc.errno not in (errno.EMFILE, errno.ENFILE) or not open_files:
                raise
            _close_oldest(open_files)
    try:
        writer = csv.writer(handle)
        # A file that was closed earlier and is being reopened already has
        # its header, so only brand-new (empty) files get one.
        if os.path.getsize(path) == 0:
            writer.writerow(header)
    except BaseException:
        handle.close()
        raise
    return _OpenOutput(handle, writer, now)


def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300):
    """Split a CSV file into one file per distinct value of a column.

    The first row of *input_file* is treated as the header and copied to the
    top of every newly created output file. Each following row is appended
    to ``<output_dir>/<sanitized value>.csv``. Rows that are too short to
    contain *column_index* are skipped with a warning. Existing output files
    are appended to, not overwritten.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to split.
        output_dir: Directory for the output files; created if missing.
        column_index: Zero-based index of the column to split on.
        max_open_files: Maximum number of output files kept open at once;
            the least recently used file is closed when the limit is hit.
        file_idle_time: Seconds without writes after which an open output
            file is closed during the periodic sweep (every 10000 rows).

    Raises:
        ValueError: If *max_open_files* is smaller than 1.
        Exception: Any error raised while reading or writing is logged and
            re-raised.
    """
    if max_open_files < 1:
        raise ValueError("max_open_files must be at least 1")

    os.makedirs(output_dir, exist_ok=True)

    # key -> _OpenOutput, ordered from least to most recently used.
    open_files = OrderedDict()
    file_usage = defaultdict(int)

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            # An empty input has no header; the loop below then simply does
            # not run instead of crashing with StopIteration.
            header = next(reader, None)
            if header is None:
                logging.warning("Input file '%s' is empty. Nothing to split.", input_file)

            for row_num, row in enumerate(reader, start=2):
                if len(row) <= column_index:
                    logging.warning("Row %d does not have enough columns. Skipping.", row_num)
                    continue

                key = sanitize_filename(row[column_index])
                file_usage[key] += 1
                # Monotonic clock: idle durations must not be affected by
                # wall-clock adjustments.
                now = time.monotonic()

                entry = open_files.get(key)
                if entry is None:
                    # Make room before opening another file.
                    while len(open_files) >= max_open_files:
                        _close_oldest(open_files)
                    output_file = os.path.join(output_dir, f"{key}.csv")
                    entry = _open_output(output_file, header, open_files, now)
                    open_files[key] = entry
                else:
                    open_files.move_to_end(key)

                entry.writer.writerow(row)
                entry.last_used = now

                # Periodically close idle files
                if row_num % 10000 == 0:
                    _close_idle(open_files, now, file_idle_time)
    except Exception as e:
        logging.error("An error occurred: %s", e)
        raise
    finally:
        for key, entry in open_files.items():
            try:
                entry.handle.close()
            except Exception as e:
                logging.error("Error closing file for key '%s': %s", key, e)

    # Only reached when no exception was raised above.
    logging.info("CSV splitting completed successfully. Total unique categories: %d", len(file_usage))


# Usage
# NOTE: this runs whenever the module is executed or imported.
input_file = 'your_large_file.csv'
output_dir = 'splitted'
column_index = 17  # Python uses 0-based indexing, so column 18 is index 17

try:
    split_csv(input_file, output_dir, column_index)
except Exception as e:
    logging.error(f"Script execution failed: {str(e)}")