
sort.sh
#!/bin/bash

# Move each top-level CSV into a directory named after its main category
# (the part of the filename before the first underscore).
find . -maxdepth 1 -name "*.csv" | while IFS= read -r file; do
    filename=$(basename "$file")
    if [[ "$filename" == *"_"* ]]; then
        # File has at least one underscore: the prefix is the main category
        main_category=$(echo "$filename" | cut -d'_' -f1)
    else
        # File has no underscore: it is itself a main category
        main_category="${filename%.csv}"
    fi
    mkdir -p "$main_category"
    mv "$file" "$main_category/"
done

echo "File organization complete!"
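
As a rough smoke test (the demo directory, the script path, and the CSV names below are placeholders, not part of the original scripts), the grouping behavior can be checked in a throwaway directory:

# Hypothetical smoke test; adjust the path to wherever sort.sh actually lives.
mkdir -p /tmp/sort_demo && cd /tmp/sort_demo
touch sales_2023.csv sales_2024.csv inventory_eu.csv misc.csv
bash /path/to/sort.sh
ls */
# Expected layout: sales/ holds sales_2023.csv and sales_2024.csv,
# inventory/ holds inventory_eu.csv, and misc/ holds misc.csv.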
split.py
import csv
import os
import re
from collections import defaultdict
import logging
import time

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def sanitize_filename(filename, max_length=255):
    """Replace characters that are illegal in filenames and trim to a safe length."""
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
    filename = filename.strip('. ')
    return filename[:max_length - 4]  # leave room for the ".csv" extension

def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300):
    """Split input_file into one CSV per distinct value in column_index.

    At most max_open_files output handles are kept open at once; handles idle
    longer than file_idle_time seconds are closed and reopened in append mode
    if their category shows up again.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_handles = {}
    file_usage = defaultdict(int)
    last_use_time = {}
    current_time = time.time()

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader)

            for row_num, row in enumerate(reader, start=2):
                if len(row) <= column_index:
                    logging.warning(f"Row {row_num} does not have enough columns. Skipping.")
                    continue

                key = sanitize_filename(row[column_index])
                file_usage[key] += 1
                current_time = time.time()

                if key not in file_handles:
                    # Close idle files if we're at the limit
                    while len(file_handles) >= max_open_files:
                        idle_key = min(last_use_time, key=lambda k: last_use_time[k])
                        if current_time - last_use_time[idle_key] > file_idle_time:
                            file_handles[idle_key].close()
                            del file_handles[idle_key]
                            del last_use_time[idle_key]
                        else:
                            # If no files are idle, close the least recently used one
                            lru_key = min(last_use_time, key=last_use_time.get)
                            file_handles[lru_key].close()
                            del file_handles[lru_key]
                            del last_use_time[lru_key]

                    output_file = os.path.join(output_dir, f"{key}.csv")
                    file_handles[key] = open(output_file, 'a', newline='', encoding='utf-8')
                    # A brand-new file still has size 0 here, so it needs the header;
                    # a file reopened after being evicted already has one.
                    if os.path.getsize(output_file) == 0:
                        writer = csv.writer(file_handles[key])
                        writer.writerow(header)

                writer = csv.writer(file_handles[key])
                writer.writerow(row)
                last_use_time[key] = current_time

                # Periodically close idle files
                if row_num % 10000 == 0:
                    for idle_key in list(last_use_time.keys()):
                        if current_time - last_use_time[idle_key] > file_idle_time:
                            if idle_key in file_handles:
                                file_handles[idle_key].close()
                                del file_handles[idle_key]
                                del last_use_time[idle_key]

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

    finally:
        for key, handle in file_handles.items():
            try:
                handle.close()
            except Exception as e:
                logging.error(f"Error closing file for key '{key}': {str(e)}")

    logging.info(f"CSV splitting completed successfully. Total unique categories: {len(file_usage)}")

# Usage
if __name__ == '__main__':
    input_file = 'your_large_file.csv'
    output_dir = 'splitted'
    column_index = 17  # Python uses 0-based indexing, so column 18 is index 17

    try:
        split_csv(input_file, output_dir, column_index)
    except Exception as e:
        logging.error(f"Script execution failed: {str(e)}")
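
One plausible way to chain the two scripts, assuming split.py has already been edited with the real input file and column index, and that sort.sh sits next to it (the paths below are placeholders):

# Hypothetical end-to-end run; adjust paths to where the scripts actually live.
python3 split.py          # writes one CSV per category value into ./splitted/
cd splitted
bash ../sort.sh           # groups the per-category CSVs into main-category folders
ls */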