sort.sh
· 465 B · Bash
原始檔案
#!/bin/bash
#
# Organize the *.csv files directly inside a directory into per-category
# subdirectories.  The category is the part of the filename before the
# first underscore; a file with no underscore is its own category (with
# the .csv extension stripped).
#
# Usage: sort.sh [dir]    (defaults to the current directory)

set -euo pipefail

# Move every regular *.csv file directly under $1 into its category dir.
organize_csvs() {
  local dir=$1
  local file filename main_category
  # NUL-delimited find output + read -d '' is safe for any filename
  # (spaces, backslashes, even embedded newlines).
  while IFS= read -r -d '' file; do
    filename=${file##*/}
    if [[ "$filename" == *"_"* ]]; then
      # Category is everything before the first underscore.
      main_category=${filename%%_*}
    else
      # No underscore: the filename itself (minus .csv) is the category.
      main_category=${filename%.csv}
    fi
    mkdir -p -- "$dir/$main_category"
    mv -- "$file" "$dir/$main_category/"
  done < <(find "$dir" -maxdepth 1 -type f -name '*.csv' -print0)
}

main() {
  organize_csvs "${1:-.}"
  echo "File organization complete!"
}

main "$@"
1 | #!/bin/bash |
2 | |
3 | find . -maxdepth 1 -name "*.csv" | while read file; do |
4 | filename=$(basename "$file") |
5 | if [[ "$filename" == *"_"* ]]; then |
6 | # File has at least one underscore |
7 | main_category=$(echo "$filename" | cut -d'_' -f1) |
8 | else |
9 | # File has no underscore, it is itself a main category |
10 | main_category="${filename%.csv}" |
11 | fi |
12 | mkdir -p "$main_category" |
13 | mv "$file" "$main_category/" |
14 | done |
15 | |
16 | echo "File organization complete!" |
split.py
· 3.6 KiB · Python
原始檔案
import csv
import os
import re
from collections import defaultdict
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def sanitize_filename(filename, max_length=255):
    """Return a filesystem-safe version of *filename*.

    Replaces characters invalid on common filesystems (plus ASCII control
    characters) with '_', strips leading/trailing dots and spaces
    (problematic on Windows), and truncates so the name plus a 4-character
    extension (".csv", appended by the caller) fits in *max_length*.
    Falls back to 'unnamed' when nothing is left, so callers never create
    a bare/hidden ".csv" file.
    """
    # Windows-forbidden punctuation and control chars become underscores.
    filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)
    filename = filename.strip('. ')
    # Reserve 4 characters for the ".csv" suffix appended by the caller.
    filename = filename[:max_length - 4]
    return filename or 'unnamed'
def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300,
              sanitize=None):
    """Split *input_file* into one CSV per distinct value in *column_index*.

    Each output file is named ``<sanitized value>.csv`` inside *output_dir*
    and receives the input header followed by its matching rows.  At most
    *max_open_files* handles are kept open at once: the least-recently-used
    handle is evicted when the limit is hit, and handles idle longer than
    *file_idle_time* seconds are closed periodically.  Output files are
    opened in append mode, so re-running adds to existing files (the header
    is written only when a file is empty).

    Args:
        input_file: path to the source CSV (UTF-8, first row is the header).
        output_dir: directory for the per-category files (created if missing).
        column_index: 0-based index of the column to split on.
        max_open_files: maximum number of simultaneously open output files.
        file_idle_time: seconds of inactivity before a handle is closed.
        sanitize: optional callable mapping a raw cell value to a safe file
            stem; defaults to :func:`sanitize_filename`.

    Returns:
        The number of unique categories seen.
    """
    if sanitize is None:
        sanitize = sanitize_filename
    os.makedirs(output_dir, exist_ok=True)
    file_handles = {}   # key -> open file object
    writers = {}        # key -> csv.writer bound to that file object
    file_usage = defaultdict(int)
    last_use_time = {}  # key -> time the last row was written

    def _close(key):
        # Close one output and forget it, keeping all three dicts in sync.
        file_handles[key].close()
        del file_handles[key]
        del writers[key]
        del last_use_time[key]

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader)

            for row_num, row in enumerate(reader, start=2):
                if len(row) <= column_index:
                    logging.warning(f"Row {row_num} does not have enough columns. Skipping.")
                    continue

                key = sanitize(row[column_index])
                file_usage[key] += 1
                current_time = time.time()

                if key not in file_handles:
                    # At the limit: evict least-recently-used handles.  This
                    # subsumes the "idle" case, since the LRU entry is also
                    # the one idle the longest.
                    while len(file_handles) >= max_open_files:
                        _close(min(last_use_time, key=last_use_time.get))

                    output_file = os.path.join(output_dir, f"{key}.csv")
                    handle = open(output_file, 'a', newline='', encoding='utf-8')
                    file_handles[key] = handle
                    writers[key] = csv.writer(handle)
                    # Write the header only into a brand-new (empty) file;
                    # a re-opened or pre-existing file already has one.
                    if os.path.getsize(output_file) == 0:
                        writers[key].writerow(header)

                writers[key].writerow(row)
                last_use_time[key] = current_time

                # Periodically close handles that have been idle too long.
                if row_num % 10000 == 0:
                    for idle_key in list(last_use_time):
                        if current_time - last_use_time[idle_key] > file_idle_time:
                            _close(idle_key)

    except StopIteration:
        # next(reader) found no header row: empty input, nothing to split.
        logging.warning(f"Input file '{input_file}' is empty; nothing to do.")
    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise
    finally:
        for key, handle in file_handles.items():
            try:
                handle.close()
            except Exception as e:
                logging.error(f"Error closing file for key '{key}': {str(e)}")

    logging.info(f"CSV splitting completed successfully. Total unique categories: {len(file_usage)}")
    return len(file_usage)
# Usage: run this module directly to split the configured file.  The
# __main__ guard keeps the split from running (and failing on a missing
# input file) when this module is merely imported.
if __name__ == '__main__':
    input_file = 'your_large_file.csv'
    output_dir = 'splitted'
    column_index = 17  # Python uses 0-based indexing, so column 18 is index 17

    try:
        split_csv(input_file, output_dir, column_index)
    except Exception as e:
        logging.error(f"Script execution failed: {str(e)}")
1 | import csv |
2 | import os |
3 | import re |
4 | from collections import defaultdict |
5 | import logging |
6 | import time |
7 | |
8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
9 | |
10 | def sanitize_filename(filename, max_length=255): |
11 | filename = re.sub(r'[<>:"/\\|?*]', '_', filename) |
12 | filename = filename.strip('. ') |
13 | return filename[:max_length - 4] |
14 | |
15 | def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300): |
16 | os.makedirs(output_dir, exist_ok=True) |
17 | file_handles = {} |
18 | file_usage = defaultdict(int) |
19 | last_use_time = {} |
20 | current_time = time.time() |
21 | |
22 | try: |
23 | with open(input_file, 'r', newline='', encoding='utf-8') as csvfile: |
24 | reader = csv.reader(csvfile) |
25 | header = next(reader) |
26 | |
27 | for row_num, row in enumerate(reader, start=2): |
28 | if len(row) <= column_index: |
29 | logging.warning(f"Row {row_num} does not have enough columns. Skipping.") |
30 | continue |
31 | |
32 | key = sanitize_filename(row[column_index]) |
33 | file_usage[key] += 1 |
34 | current_time = time.time() |
35 | |
36 | if key not in file_handles: |
37 | # Close idle files if we're at the limit |
38 | while len(file_handles) >= max_open_files: |
39 | idle_key = min(last_use_time, key=lambda k: last_use_time[k]) |
40 | if current_time - last_use_time[idle_key] > file_idle_time: |
41 | file_handles[idle_key].close() |
42 | del file_handles[idle_key] |
43 | del last_use_time[idle_key] |
44 | else: |
45 | # If no files are idle, close the least recently used one |
46 | lru_key = min(last_use_time, key=last_use_time.get) |
47 | file_handles[lru_key].close() |
48 | del file_handles[lru_key] |
49 | del last_use_time[lru_key] |
50 | |
51 | output_file = os.path.join(output_dir, f"{key}.csv") |
52 | file_handles[key] = open(output_file, 'a', newline='', encoding='utf-8') |
53 | if os.path.getsize(output_file) == 0: |
54 | writer = csv.writer(file_handles[key]) |
55 | writer.writerow(header) |
56 | |
57 | writer = csv.writer(file_handles[key]) |
58 | writer.writerow(row) |
59 | last_use_time[key] = current_time |
60 | |
61 | # Periodically close idle files |
62 | if row_num % 10000 == 0: |
63 | for idle_key in list(last_use_time.keys()): |
64 | if current_time - last_use_time[idle_key] > file_idle_time: |
65 | if idle_key in file_handles: |
66 | file_handles[idle_key].close() |
67 | del file_handles[idle_key] |
68 | del last_use_time[idle_key] |
69 | |
70 | except Exception as e: |
71 | logging.error(f"An error occurred: {str(e)}") |
72 | raise |
73 | |
74 | finally: |
75 | for key, handle in file_handles.items(): |
76 | try: |
77 | handle.close() |
78 | except Exception as e: |
79 | logging.error(f"Error closing file for key '{key}': {str(e)}") |
80 | |
81 | logging.info(f"CSV splitting completed successfully. Total unique categories: {len(file_usage)}") |
82 | |
83 | # Usage |
84 | input_file = 'your_large_file.csv' |
85 | output_dir = 'splitted' |
86 | column_index = 17 # Python uses 0-based indexing, so column 18 is index 17 |
87 | |
88 | try: |
89 | split_csv(input_file, output_dir, column_index) |
90 | except Exception as e: |
91 | logging.error(f"Script execution failed: {str(e)}") |