Last active June 7, 2024

rip-imagebam.py
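# Overview: walks the bitch-show.com listing pages, follows each gallery link,
# resolves the imagebam.com "view" pages inside each gallery, and downloads the
# full-size images into one folder per gallery, fetching each gallery's images
# with a small thread pool. Requires the `requests` and `beautifulsoup4` packages.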
# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_webpage(url):
    # Fetch a page and return its HTML text, or None on a non-200 response.
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None


def parse_galleries_from_root(html_content):
    # Collect all gallery links from a listing page. Always returns a list so
    # callers can iterate even when the expected container is missing.
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')

    urls = []
    if main_content:
        links = main_content.find_all('a')
        for link in links:
            href = link.get('href')
            if href and 'https://bitch-show.com/gallery/' in href:
                urls.append(href)
    return urls


def parse_imagehost_urls_from_gallery(html_content):
    # Collect the imagebam "view" links from a single gallery page.
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    td_content = main_content.find('td', class_='content') if main_content else None
    if td_content:
        links = td_content.find_all('a')

        urls = []
        for link in links:
            href = link.get('href')
            if href and 'https://www.imagebam.com/view/' in href:
                # Some hrefs wrap the imagebam URL; keep everything from the imagebam part onward.
                start_index = href.find('https://www.imagebam.com/view/')
                if start_index != -1:
                    urls.append(href[start_index:])
        return urls
    else:
        print("No `td` element with class `content` found.")
        return []


def get_cookie_expiration():
    # Build a Set-Cookie style expiry string six hours from now (local time).
    current_datetime = datetime.now()
    new_datetime = current_datetime + timedelta(hours=6)
    return new_datetime.strftime("%a, %d-%b-%Y %H:%M:%S GMT")


def fetch_webpage_with_cookie(url):
    expiration = get_cookie_expiration()
    # requests sends every key in this dict as a cookie name/value pair;
    # 'nsfw_inter=1' is the one that matters (presumably it skips imagebam's
    # NSFW interstitial), while 'expires' and 'path' just ride along as extra cookies.
    cookies = {
        'nsfw_inter': '1',
        'expires': expiration,
        'path': '/'
    }

    with requests.Session() as session:
        response = session.get(url, cookies=cookies)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            return None


def parse_imagebam_page(html_content):
    # Pull the full-size image URL out of an imagebam view page.
    soup = BeautifulSoup(html_content, 'html.parser')
    image_tag = soup.find('img', class_='main-image')
    if image_tag:
        image_url = image_tag.get('src')
        return image_url
    else:
        print("No image found on the page.")
        return None


def download_image(url, save_dir):
    try:
        os.makedirs(save_dir, exist_ok=True)
        filename = url.split('/')[-1]
        save_path = os.path.join(save_dir, filename)

        # Skip files we already have before spending bandwidth on the download.
        if os.path.exists(save_path):
            print(f"{save_path} already exists.")
            return

        response = requests.get(url)
        response.raise_for_status()

        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image successfully downloaded and saved to {save_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")


def download_gallery_images(gallery_url, save_root):
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if html_content:
        urls = parse_imagehost_urls_from_gallery(html_content)
        gallery_name = gallery_url.split('/')[-1]
        save_dir = os.path.join(save_root, gallery_name)

        # Resolve and download this gallery's images concurrently.
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(fetch_and_download_image, url, save_dir) for url in urls]
            for future in as_completed(futures):
                future.result()


def fetch_and_download_image(url, save_dir):
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        if not imagebam_content:
            return
        image_src = parse_imagebam_page(imagebam_content)
        if image_src:
            download_image(image_src, save_dir)
    except Exception as e:
        print(f"Failed to download the image: {e}")


# Main function
def main():
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'

    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')

        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            continue
        gallery_urls = parse_galleries_from_root(page_html_content)

        for gallery_url in gallery_urls:
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()
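# Rough usage, assuming Python 3 with the third-party dependencies installed:
#   pip install requests beautifulsoup4
#   python rip-imagebam.py
# Listing pages 1-500 are walked and each gallery is saved into its own
# subfolder under save_root (the current directory by default).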