Última atividade 1725600722

Malin's Avatar Malin revisou este gist 1725600721. Ir para a revisão

2 files changed, 107 insertions

sort.sh(arquivo criado)

@@ -0,0 +1,16 @@
#!/bin/bash
#
# Sort the CSV files of the current directory into per-category folders.
#
# The category of a file is the part of its name before the first underscore
# ("sales_2023.csv" -> "sales/"); a name without an underscore is its own
# category ("sales.csv" -> "sales/"). Only the current directory is scanned
# (-maxdepth 1), so files that were already moved are not touched again.

# NUL-delimited names plus `IFS= read -r` keep names containing spaces,
# backslashes or newlines intact.
find . -maxdepth 1 -name "*.csv" -print0 | while IFS= read -r -d '' file; do
    filename=$(basename -- "$file")
    if [[ "$filename" == *"_"* ]]; then
        # File has at least one underscore: keep the text before the first one
        main_category="${filename%%_*}"
    else
        # File has no underscore, it is itself a main category
        main_category="${filename%.csv}"
    fi

    # Names such as "_x.csv" or ".csv" yield an empty category; without this
    # guard the move target below would expand to "/".
    if [[ -z "$main_category" ]]; then
        echo "Skipping '$filename': cannot derive a category name" >&2
        continue
    fi

    # `--` stops option parsing so a category starting with "-" is safe;
    # the file is only moved when its folder could actually be created.
    mkdir -p -- "$main_category" && mv -- "$file" "$main_category/"
done

echo "File organization complete!"

split.py(arquivo criado)

@@ -0,0 +1,91 @@
1 + import csv
2 + import os
3 + import re
4 + from collections import defaultdict
5 + import logging
6 + import time
7 +
# Configure the root logger: INFO and above, rendered as
# "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
9 +
def sanitize_filename(filename, max_length=255):
    """Turn an arbitrary cell value into a safe file-name stem.

    Characters that are not allowed in file names (``<>:"/\\|?*`` and ASCII
    control characters) are replaced with ``_``, surrounding dots and spaces
    are removed, and the result is truncated so that a four-character
    extension such as ``.csv`` still fits within ``max_length``.

    Args:
        filename: Raw text to derive the name from.
        max_length: Maximum length of the final name *including* a
            four-character extension that the caller appends.

    Returns:
        A non-empty stem of at most ``max_length - 4`` characters (at least
        one character is always kept). ``'_'`` is returned when nothing
        usable remains, so the caller never ends up with a bare ``.csv``.
    """
    # Control characters (NUL, newlines, ...) are replaced as well: NUL makes
    # open() raise and the others produce unusable names.
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)
    cleaned = cleaned.strip('. ')

    # Reserve room for the extension; never let the bound drop below 1 so a
    # tiny max_length cannot turn into a negative slice index.
    limit = max(max_length - 4, 1)

    # Truncation may expose a new trailing dot/space, so strip once more.
    cleaned = cleaned[:limit].rstrip('. ')
    return cleaned or '_'
14 +
def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300):
    """Split a CSV file into one output file per distinct value of a column.

    Every data row is appended to ``<output_dir>/<key>.csv`` where ``key`` is
    the sanitized value found at ``column_index``. The input header is written
    to an output file the first time that file is found empty. Output files
    are opened in append mode, so rows are added to files that already exist.

    Args:
        input_file: Path of the UTF-8 CSV file to split; its first row is
            treated as the header.
        output_dir: Directory receiving the per-key files (created if needed).
        column_index: 0-based index of the column whose value selects the
            output file. Rows that are too short are skipped with a warning.
        max_open_files: Upper bound on simultaneously open output files; the
            least recently used one is closed when the bound is reached.
        file_idle_time: Seconds without a write after which an output file is
            closed by the periodic sweep (every 10000 input rows).

    Raises:
        Exception: Any error raised while reading or writing is logged and
            re-raised unchanged.
    """
    os.makedirs(output_dir, exist_ok=True)
    open_files = {}  # key -> (file handle, csv writer bound to that handle)
    file_usage = defaultdict(int)  # rows seen per key; used for the summary
    last_use_time = {}  # key -> timestamp of the most recent write

    def close_key(key):
        # Forget the key in both tables before closing, so a failing close()
        # cannot leave a stale entry behind.
        handle, _ = open_files.pop(key)
        last_use_time.pop(key, None)
        handle.close()

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader, None)
            if header is None:
                # An empty input would otherwise surface as a bare
                # StopIteration with an empty error message.
                logging.warning(f"Input file '{input_file}' is empty. Nothing to split.")
                return

            # The header is row 1, so data rows are numbered from 2.
            for row_num, row in enumerate(reader, start=2):
                if len(row) <= column_index:
                    logging.warning(f"Row {row_num} does not have enough columns. Skipping.")
                    continue

                key = sanitize_filename(row[column_index])
                file_usage[key] += 1
                current_time = time.time()

                if key not in open_files:
                    # Make room by closing the least recently used files.
                    # The emptiness check keeps max_open_files <= 0 from
                    # calling min() on an empty mapping.
                    while open_files and len(open_files) >= max_open_files:
                        close_key(min(last_use_time, key=last_use_time.get))

                    output_file = os.path.join(output_dir, f"{key}.csv")
                    handle = open(output_file, 'a', newline='', encoding='utf-8')
                    writer = csv.writer(handle)
                    open_files[key] = (handle, writer)
                    # Append mode: a re-opened file already has its header,
                    # only a still-empty file needs one.
                    if os.path.getsize(output_file) == 0:
                        writer.writerow(header)

                open_files[key][1].writerow(row)
                last_use_time[key] = current_time

                # Periodically close idle files
                if row_num % 10000 == 0:
                    idle_keys = [
                        k for k, used in last_use_time.items()
                        if current_time - used > file_idle_time
                    ]
                    for idle_key in idle_keys:
                        close_key(idle_key)

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

    finally:
        for key, (handle, _) in open_files.items():
            try:
                handle.close()
            except Exception as e:
                logging.error(f"Error closing file for key '{key}': {str(e)}")

    # Only reached when no exception escaped the block above.
    logging.info(f"CSV splitting completed successfully. Total unique categories: {len(file_usage)}")
82 +
# Usage
input_file = 'your_large_file.csv'
output_dir = 'splitted'
column_index = 17  # Python uses 0-based indexing, so column 18 is index 17

# Run the split only when executed as a script, so that importing this module
# (for example to reuse split_csv) does not start processing files.
if __name__ == "__main__":
    try:
        split_csv(input_file, output_dir, column_index)
    except Exception as e:
        logging.error(f"Script execution failed: {str(e)}")
        # Report the failure to the calling shell instead of exiting with 0.
        raise SystemExit(1) from e
Próximo Anterior