sort.sh
· 465 B · Bash
Исходник
#!/bin/bash
# Organize CSV files into category directories:
#   name_rest.csv -> name/name_rest.csv   (prefix before first '_')
#   name.csv      -> name/name.csv        (whole stem is the category)
set -euo pipefail
# nullglob: an unmatched *.csv expands to nothing instead of the literal pattern.
# dotglob: also match hidden files (parity with the original `find -name '*.csv'`).
shopt -s nullglob dotglob

#######################################
# Move every *.csv in a directory into its main-category subdirectory.
# Arguments: $1 - directory to organize (default: current directory)
# Outputs:   completion message to stdout
#######################################
organize_csvs() {
  local dir=${1:-.}
  local file filename main_category
  for file in "$dir"/*.csv; do
    filename=${file##*/}
    if [[ "$filename" == *_* ]]; then
      # At least one underscore: category is the prefix before the first '_'.
      main_category=${filename%%_*}
    else
      # No underscore: the stem itself is the category.
      main_category=${filename%.csv}
    fi
    # A leading-underscore name like "_odd.csv" yields an empty category,
    # which would make mkdir fail; fall back to a catch-all directory.
    [[ -n "$main_category" ]] || main_category="uncategorized"
    mkdir -p -- "$dir/$main_category"
    mv -- "$file" "$dir/$main_category/"
  done
  echo "File organization complete!"
}

organize_csvs "$@"
| 1 | #!/bin/bash |
| 2 | |
| 3 | find . -maxdepth 1 -name "*.csv" | while read file; do |
| 4 | filename=$(basename "$file") |
| 5 | if [[ "$filename" == *"_"* ]]; then |
| 6 | # File has at least one underscore |
| 7 | main_category=$(echo "$filename" | cut -d'_' -f1) |
| 8 | else |
| 9 | # File has no underscore, it is itself a main category |
| 10 | main_category="${filename%.csv}" |
| 11 | fi |
| 12 | mkdir -p "$main_category" |
| 13 | mv "$file" "$main_category/" |
| 14 | done |
| 15 | |
| 16 | echo "File organization complete!" |
split.py
· 3.6 KiB · Python
Исходник
import csv
import os
import re
from collections import defaultdict
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def sanitize_filename(filename, max_length=255):
    """Make a string safe to use as a filename.

    Replaces characters that are invalid on common filesystems with '_',
    strips leading/trailing dots and spaces (problematic on Windows), and
    truncates so that the name plus a 4-char extension (".csv") fits in
    max_length.

    Args:
        filename: Raw string to sanitize.
        max_length: Maximum total filename length including extension.

    Returns:
        A non-empty sanitized name. If sanitization removes everything
        (e.g. the input was only dots/spaces), returns 'unnamed' so the
        caller never produces a hidden ".csv" file.
    """
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
    filename = filename.strip('. ')
    if not filename:
        # Degenerate input such as "  " or "..." would otherwise yield "".
        return 'unnamed'
    return filename[:max_length - 4]
def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300):
    """Split a CSV file into one file per distinct value of a column.

    Each output file is named after the sanitized column value, written to
    output_dir, and receives the header row once. Output files are opened
    in append mode so a key whose handle was evicted and reopened continues
    where it left off. At most max_open_files handles are kept open; when
    the cap is hit the least-recently-used handle is closed, and every
    10000 rows handles idle longer than file_idle_time seconds are closed.

    Args:
        input_file: Path to the source CSV (UTF-8).
        output_dir: Directory for the per-key CSV files (created if missing).
        column_index: 0-based index of the column to split on.
        max_open_files: Cap on simultaneously open output handles.
        file_idle_time: Seconds of inactivity before a handle is closed.

    Raises:
        Re-raises any exception from reading/writing after logging it.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_handles = {}
    writers = {}  # one csv.writer per handle, created once instead of per row
    file_usage = defaultdict(int)
    last_use_time = {}

    def _evict(key):
        # Close a handle and drop all bookkeeping for it in one place,
        # so file_handles / writers / last_use_time can never drift apart.
        file_handles[key].close()
        del file_handles[key]
        del writers[key]
        del last_use_time[key]

    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader, None)
            if header is None:
                # Empty input: nothing to split (avoids a bare StopIteration).
                logging.warning(f"Input file '{input_file}' is empty. Nothing to do.")
                return

            for row_num, row in enumerate(reader, start=2):
                if len(row) <= column_index:
                    logging.warning(f"Row {row_num} does not have enough columns. Skipping.")
                    continue

                key = sanitize_filename(row[column_index])
                file_usage[key] += 1
                current_time = time.time()

                if key not in file_handles:
                    # Enforce the open-file cap. Closing the least-recently-
                    # used key covers both the "idle" and "LRU" cases, since
                    # the oldest entry is the idle candidate anyway.
                    while len(file_handles) >= max_open_files:
                        _evict(min(last_use_time, key=last_use_time.get))

                    output_file = os.path.join(output_dir, f"{key}.csv")
                    file_handles[key] = open(output_file, 'a', newline='', encoding='utf-8')
                    writers[key] = csv.writer(file_handles[key])
                    # Size 0 means the file was just created (not reopened
                    # after eviction), so it still needs the header.
                    if os.path.getsize(output_file) == 0:
                        writers[key].writerow(header)
                    last_use_time[key] = current_time

                writers[key].writerow(row)
                last_use_time[key] = current_time

                # Periodically close handles that have been idle too long.
                if row_num % 10000 == 0:
                    for idle_key in [k for k, t in last_use_time.items()
                                     if current_time - t > file_idle_time]:
                        _evict(idle_key)

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise
    finally:
        for key, handle in file_handles.items():
            try:
                handle.close()
            except Exception as e:
                logging.error(f"Error closing file for key '{key}': {str(e)}")

    # Only reached when no exception propagated; the original logged
    # "completed successfully" from finally even on failure.
    logging.info(f"CSV splitting completed successfully. Total unique categories: {len(file_usage)}")
# Usage: run as a script. Guarded so importing this module (e.g. to reuse
# split_csv elsewhere or in tests) does not trigger a full split run.
if __name__ == '__main__':
    input_file = 'your_large_file.csv'
    output_dir = 'splitted'
    column_index = 17  # Python uses 0-based indexing, so column 18 is index 17

    try:
        split_csv(input_file, output_dir, column_index)
    except Exception as e:
        logging.error(f"Script execution failed: {str(e)}")
| 1 | import csv |
| 2 | import os |
| 3 | import re |
| 4 | from collections import defaultdict |
| 5 | import logging |
| 6 | import time |
| 7 | |
| 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
| 9 | |
| 10 | def sanitize_filename(filename, max_length=255): |
| 11 | filename = re.sub(r'[<>:"/\\|?*]', '_', filename) |
| 12 | filename = filename.strip('. ') |
| 13 | return filename[:max_length - 4] |
| 14 | |
| 15 | def split_csv(input_file, output_dir, column_index, max_open_files=1000, file_idle_time=300): |
| 16 | os.makedirs(output_dir, exist_ok=True) |
| 17 | file_handles = {} |
| 18 | file_usage = defaultdict(int) |
| 19 | last_use_time = {} |
| 20 | current_time = time.time() |
| 21 | |
| 22 | try: |
| 23 | with open(input_file, 'r', newline='', encoding='utf-8') as csvfile: |
| 24 | reader = csv.reader(csvfile) |
| 25 | header = next(reader) |
| 26 | |
| 27 | for row_num, row in enumerate(reader, start=2): |
| 28 | if len(row) <= column_index: |
| 29 | logging.warning(f"Row {row_num} does not have enough columns. Skipping.") |
| 30 | continue |
| 31 | |
| 32 | key = sanitize_filename(row[column_index]) |
| 33 | file_usage[key] += 1 |
| 34 | current_time = time.time() |
| 35 | |
| 36 | if key not in file_handles: |
| 37 | # Close idle files if we're at the limit |
| 38 | while len(file_handles) >= max_open_files: |
| 39 | idle_key = min(last_use_time, key=lambda k: last_use_time[k]) |
| 40 | if current_time - last_use_time[idle_key] > file_idle_time: |
| 41 | file_handles[idle_key].close() |
| 42 | del file_handles[idle_key] |
| 43 | del last_use_time[idle_key] |
| 44 | else: |
| 45 | # If no files are idle, close the least recently used one |
| 46 | lru_key = min(last_use_time, key=last_use_time.get) |
| 47 | file_handles[lru_key].close() |
| 48 | del file_handles[lru_key] |
| 49 | del last_use_time[lru_key] |
| 50 | |
| 51 | output_file = os.path.join(output_dir, f"{key}.csv") |
| 52 | file_handles[key] = open(output_file, 'a', newline='', encoding='utf-8') |
| 53 | if os.path.getsize(output_file) == 0: |
| 54 | writer = csv.writer(file_handles[key]) |
| 55 | writer.writerow(header) |
| 56 | |
| 57 | writer = csv.writer(file_handles[key]) |
| 58 | writer.writerow(row) |
| 59 | last_use_time[key] = current_time |
| 60 | |
| 61 | # Periodically close idle files |
| 62 | if row_num % 10000 == 0: |
| 63 | for idle_key in list(last_use_time.keys()): |
| 64 | if current_time - last_use_time[idle_key] > file_idle_time: |
| 65 | if idle_key in file_handles: |
| 66 | file_handles[idle_key].close() |
| 67 | del file_handles[idle_key] |
| 68 | del last_use_time[idle_key] |
| 69 | |
| 70 | except Exception as e: |
| 71 | logging.error(f"An error occurred: {str(e)}") |
| 72 | raise |
| 73 | |
| 74 | finally: |
| 75 | for key, handle in file_handles.items(): |
| 76 | try: |
| 77 | handle.close() |
| 78 | except Exception as e: |
| 79 | logging.error(f"Error closing file for key '{key}': {str(e)}") |
| 80 | |
| 81 | logging.info(f"CSV splitting completed successfully. Total unique categories: {len(file_usage)}") |
| 82 | |
| 83 | # Usage |
| 84 | input_file = 'your_large_file.csv' |
| 85 | output_dir = 'splitted' |
| 86 | column_index = 17 # Python uses 0-based indexing, so column 18 is index 17 |
| 87 | |
| 88 | try: |
| 89 | split_csv(input_file, output_dir, column_index) |
| 90 | except Exception as e: |
| 91 | logging.error(f"Script execution failed: {str(e)}") |