Malin revisou este gist. Ir para a revisão
2 files changed, 107 insertions
sort.sh (arquivo criado)
@@ -0,0 +1,16 @@ | |||
#!/bin/bash
# Organize top-level CSV files into directories named after their main
# category: the part of the filename before the first underscore, or the
# whole basename (minus .csv) when there is no underscore.

# -print0 with `IFS= read -r -d ''` is robust against filenames containing
# spaces, backslashes, and even newlines; the original `while read file`
# mangled backslashes and trimmed surrounding whitespace.
find . -maxdepth 1 -name "*.csv" -print0 | while IFS= read -r -d '' file; do
    filename=$(basename "$file")
    if [[ "$filename" == *"_"* ]]; then
        # At least one underscore: category is the prefix before the first
        # one (parameter expansion avoids spawning a `cut` subprocess).
        main_category=${filename%%_*}
    else
        # No underscore: the file name itself is the main category.
        main_category="${filename%.csv}"
    fi
    mkdir -p "$main_category"
    mv "$file" "$main_category/"
done

echo "File organization complete!"
split.py (arquivo criado)
@@ -0,0 +1,91 @@ | |||
1 | + | import csv | |
2 | + | import os | |
3 | + | import re | |
4 | + | from collections import defaultdict | |
5 | + | import logging | |
6 | + | import time | |
7 | + | ||
# Configure the root logger once for the whole script: timestamped INFO output.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
9 | + | ||
def sanitize_filename(filename, max_length=255):
    """Return *filename* made safe for use as a file name.

    Characters illegal on common filesystems are replaced with underscores,
    leading/trailing dots and spaces are stripped, and the result is truncated
    so that appending a 4-character extension (".csv") stays within
    *max_length* characters.
    """
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
    filename = filename.strip('. ')
    if not filename:
        # Inputs made up of only dots/spaces would otherwise yield an empty
        # key and a hidden output file named just ".csv".
        filename = 'unknown'
    return filename[:max_length - 4]
14 | + | ||
def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300):
    """Split *input_file* (a CSV with a header row) into one CSV per distinct
    value of column *column_index*, written under *output_dir*.

    Each output file gets a copy of the header row when first created. At most
    *max_open_files* output handles are kept open simultaneously; the least
    recently used handle is evicted when the budget is exceeded, and handles
    idle for longer than *file_idle_time* seconds are closed periodically.
    Rows shorter than *column_index* + 1 are skipped with a warning.

    Raises:
        Whatever the underlying I/O or csv parsing raises; the error is
        logged before being re-raised.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_handles = {}           # key -> open append-mode file handle
    last_use_time = {}          # key -> wall-clock time of last use
    file_usage = defaultdict(int)  # key -> number of rows routed to it

    def _evict(key):
        # Close one output handle and drop its LRU bookkeeping.
        file_handles.pop(key).close()
        last_use_time.pop(key, None)

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader)

            for row_num, row in enumerate(reader, start=2):
                if len(row) <= column_index:
                    logging.warning(f"Row {row_num} does not have enough columns. Skipping.")
                    continue

                key = sanitize_filename(row[column_index])
                file_usage[key] += 1
                current_time = time.time()

                if key not in file_handles:
                    # Enforce the open-handle budget by evicting LRU entries.
                    # (min over last_use_time is the least recently used key;
                    # the original's separate "idle" branch closed the same
                    # key either way.)
                    while len(file_handles) >= max_open_files:
                        lru_key = min(last_use_time, key=last_use_time.get)
                        _evict(lru_key)

                    output_file = os.path.join(output_dir, f"{key}.csv")
                    needs_header = (not os.path.exists(output_file)
                                    or os.path.getsize(output_file) == 0)
                    handle = open(output_file, 'a', newline='', encoding='utf-8')
                    file_handles[key] = handle
                    # Record use immediately so a freshly opened handle is
                    # never invisible to the LRU bookkeeping (the original
                    # only recorded it after the first row was written).
                    last_use_time[key] = current_time
                    if needs_header:
                        csv.writer(handle).writerow(header)

                csv.writer(file_handles[key]).writerow(row)
                last_use_time[key] = current_time

                # Periodically close handles that have been idle too long.
                if row_num % 10000 == 0:
                    for idle_key in list(last_use_time):
                        if current_time - last_use_time[idle_key] > file_idle_time:
                            _evict(idle_key)

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

    finally:
        for key, handle in file_handles.items():
            try:
                handle.close()
            except Exception as e:
                logging.error(f"Error closing file for key '{key}': {str(e)}")

    logging.info(f"CSV splitting completed successfully. Total unique categories: {len(file_usage)}")
82 | + | ||
def _main():
    """Example driver: split a large CSV on its 18th column."""
    input_file = 'your_large_file.csv'
    output_dir = 'splitted'
    column_index = 17  # Python uses 0-based indexing, so column 18 is index 17

    try:
        split_csv(input_file, output_dir, column_index)
    except Exception as e:
        logging.error(f"Script execution failed: {str(e)}")


# Guarding the entry point lets the module be imported (e.g. to reuse
# split_csv) without immediately running the split.
if __name__ == "__main__":
    _main()
Próximo
Anterior