rip-imagebam.py
· 4.6 KiB · Python
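A scraper (shared in the Reddit thread linked in the first comment) that walks the paginated gallery index of bitch-show.com, follows each gallery's imagebam.com viewer links, and downloads the full-size images into one folder per gallery.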
# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_webpage(url):
    """Fetch a page and return its HTML, or None on a non-200 response."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None


def parse_galleries_from_root(html_content):
    """Collect gallery URLs from an index page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    if not main_content:
        return []

    urls = []
    for link in main_content.find_all('a'):
        href = link.get('href')
        if href and 'https://bitch-show.com/gallery/' in href:
            urls.append(href)
    return urls


def parse_imagehost_urls_from_gallery(html_content):
    """Collect imagebam viewer URLs from a gallery page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    td_content = main_content.find('td', class_='content') if main_content else None
    if not td_content:
        print("No `td` element with class `content` found.")
        return []

    urls = []
    for link in td_content.find_all('a'):
        href = link.get('href')
        if href and 'https://www.imagebam.com/view/' in href:
            # Some hrefs wrap the imagebam URL; keep only the part starting at it.
            start_index = href.find('https://www.imagebam.com/view/')
            urls.append(href[start_index:])
    return urls


def get_cookie_expiration():
    """Return a timestamp six hours from now in cookie date format."""
    new_datetime = datetime.now() + timedelta(hours=6)
    return new_datetime.strftime("%a, %d-%b-%Y %H:%M:%S GMT")


def fetch_webpage_with_cookie(url):
    """Fetch an imagebam page with the NSFW interstitial cookie set."""
    # Note: requests sends every key here as a cookie, so only 'nsfw_inter'
    # matters; 'expires' and 'path' are kept from the original script but are
    # transmitted as ordinary cookies, not cookie attributes.
    cookies = {
        'nsfw_inter': '1',
        'expires': get_cookie_expiration(),
        'path': '/'
    }
    with requests.Session() as session:
        response = session.get(url, cookies=cookies)
        if response.status_code == 200:
            return response.text
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None


def parse_imagebam_page(html_content):
    """Return the full-size image URL from an imagebam viewer page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    image_tag = soup.find('img', class_='main-image')
    if image_tag:
        return image_tag.get('src')
    print("No image found on the page.")
    return None


def download_image(url, save_dir):
    """Download one image into save_dir, skipping files that already exist."""
    try:
        os.makedirs(save_dir, exist_ok=True)
        filename = url.split('/')[-1]
        save_path = os.path.join(save_dir, filename)

        if os.path.exists(save_path):
            print(f"{save_path} already exists.")
            return

        response = requests.get(url)
        response.raise_for_status()

        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image successfully downloaded and saved to {save_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")


def download_gallery_images(gallery_url, save_root):
    """Download every image in a gallery, resolving imagebam pages concurrently."""
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if not html_content:
        return

    urls = parse_imagehost_urls_from_gallery(html_content)
    gallery_name = gallery_url.split('/')[-1]
    save_dir = os.path.join(save_root, gallery_name)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_and_download_image, url, save_dir) for url in urls]
        for future in as_completed(futures):
            future.result()


def fetch_and_download_image(url, save_dir):
    """Resolve an imagebam viewer URL to its image and download it."""
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        if not imagebam_content:
            return
        image_src = parse_imagebam_page(imagebam_content)
        if image_src:
            download_image(image_src, save_dir)
    except Exception as e:
        print(f"Failed to download the image: {e}")


# Main function
def main():
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'

    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')

        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            continue

        for gallery_url in parse_galleries_from_root(page_html_content):
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()
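To run it, install the two third-party dependencies it imports (pip install requests beautifulsoup4), then run python rip-imagebam.py from the directory where you want the output; galleries are saved as subfolders of the current directory, and pages 1 through 500 of the index are scraped by default.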