rip-imagebam.py

# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_webpage(url):
    # Plain GET; a timeout keeps one hung connection from stalling the whole run.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.text
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None


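# Optional sketch (not part of the original script): a retry wrapper around
# fetch_webpage for flaky pages. The attempt count and backoff delay are
# assumptions, not values from the source.
def fetch_webpage_with_retries(url, attempts=3, backoff_seconds=2.0):
    import time  # local import keeps the sketch self-contained
    for attempt in range(attempts):
        html = fetch_webpage(url)
        if html is not None:
            return html
        time.sleep(backoff_seconds * (attempt + 1))  # linear backoff
    return None

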
def parse_galleries_from_root(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')

    if not main_content:
        # Return [] instead of falling through to an implicit None, which
        # would crash the caller's `for` loop.
        print("No `div` element with class `flex-container` found.")
        return []

    urls = []
    for link in main_content.find_all('a'):
        href = link.get('href')
        if href and 'https://bitch-show.com/gallery/' in href:
            urls.append(href)
    return urls


def parse_imagehost_urls_from_gallery(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    # Guard each lookup: chaining .find() on a missing element raises AttributeError.
    td_content = main_content.find('td', class_='content') if main_content else None
    if not td_content:
        print("No `td` element with class `content` found.")
        return []

    urls = []
    for link in td_content.find_all('a'):
        href = link.get('href')
        if href and 'https://www.imagebam.com/view/' in href:
            # Some hrefs wrap the imagebam link; keep only the part starting
            # at the view URL. find() cannot return -1 here because the
            # substring check above already matched.
            start_index = href.find('https://www.imagebam.com/view/')
            urls.append(href[start_index:])
    return urls


def get_cookie_expiration():
    # Expiry six hours from now, as a Unix timestamp (the form the cookie
    # jar expects for `expires`).
    return int((datetime.now() + timedelta(hours=6)).timestamp())


def fetch_webpage_with_cookie(url):
    with requests.Session() as session:
        # `expires` and `path` are cookie *attributes*, not cookies of their
        # own. Set them through the cookie jar; passing them in a cookies=
        # dict would send them as bogus cookies named "expires" and "path".
        session.cookies.set('nsfw_inter', '1', path='/',
                            expires=get_cookie_expiration())
        response = session.get(url, timeout=30)
        if response.status_code == 200:
            return response.text
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None


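# For reference (an assumption about the site, not stated in the source):
# the jar above makes every request carry "Cookie: nsfw_inter=1", which
# appears to be what dismisses the NSFW interstitial so the returned HTML
# contains the actual image page.

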
def parse_imagebam_page(html_content):
    if not html_content:
        # fetch_webpage_with_cookie returns None on failure; bail out early.
        return None
    soup = BeautifulSoup(html_content, 'html.parser')
    image_tag = soup.find('img', class_='main-image')
    if image_tag:
        return image_tag.get('src')
    print("No image found on the page.")
    return None


def download_image(url, save_dir):
    try:
        os.makedirs(save_dir, exist_ok=True)
        filename = url.split('/')[-1]
        save_path = os.path.join(save_dir, filename)

        # Check for an existing file *before* downloading; the original
        # fetched the image first and only then noticed it was already saved.
        if os.path.exists(save_path):
            print(f"{save_path} already exists.")
            return

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image successfully downloaded and saved to {save_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")


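# Optional sketch (not part of the original script): stream large files to
# disk in chunks instead of buffering the whole response in memory. The
# 64 KiB chunk size is an assumption.
def download_image_streaming(url, save_path):
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                file.write(chunk)

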
def download_gallery_images(gallery_url, save_root):
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if not html_content:
        return

    urls = parse_imagehost_urls_from_gallery(html_content)
    gallery_name = gallery_url.split('/')[-1]
    # os.path.join is safer than string concatenation in case save_root
    # lacks a trailing slash.
    save_dir = os.path.join(save_root, gallery_name)

    # Fetch the imagebam pages and download their images concurrently.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_and_download_image, url, save_dir)
                   for url in urls]
        for future in as_completed(futures):
            # Re-raise any exception that escaped a worker.
            future.result()


def fetch_and_download_image(url, save_dir):
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        image_src = parse_imagebam_page(imagebam_content)
        if image_src:
            download_image(image_src, save_dir)
    except Exception as e:
        # Distinct message from download_image, so failures are traceable.
        print(f"Failed to fetch and download the image: {e}")


# Main entry point: walk the paginated gallery listing and mirror each
# gallery it links to.
def main():
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'

    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')

        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            # Skip pages that failed to load instead of crashing the run.
            continue
        gallery_urls = parse_galleries_from_root(page_html_content)

        for gallery_url in gallery_urls:
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()
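# Usage (a sketch): run the script directly and it mirrors each gallery into
# a subfolder of save_root named after the gallery:
#   python rip-imagebam.py
# The 500-page range in main() is hard-coded; adjust it to match the site's
# actual page count.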