rip-imagebam.py
# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/
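"""Bulk-download ImageBam-hosted images linked from bitch-show.com galleries.

For each gallery index page, the script collects the gallery links, then follows
every https://www.imagebam.com/view/ link inside a gallery and saves the
full-size image into a folder named after the gallery.

Requires the `requests` and `beautifulsoup4` packages:

    pip install requests beautifulsoup4
    python rip-imagebam.py
"""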
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
def fetch_webpage(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

def parse_galleries_from_root(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    if main_content:
        links = main_content.find_all('a')
        urls = []
        for link in links:
            href = link.get('href')
            if href and 'https://bitch-show.com/gallery/' in href:
                urls.append(href)
        return urls
    # Return an empty list (rather than None) so callers can iterate safely.
    return []

def parse_imagehost_urls_from_gallery(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    if not main_content:
        print("No `div` element with class `flex-container` found.")
        return []
    td_content = main_content.find('td', class_='content')
    if td_content:
        links = td_content.find_all('a')
        urls = []
        for link in links:
            href = link.get('href')
            if href and 'https://www.imagebam.com/view/' in href:
                # Strip anything before the ImageBam URL (e.g. a redirect prefix).
                start_index = href.find('https://www.imagebam.com/view/')
                full_url = href[start_index:]
                urls.append(full_url)
        return urls
    else:
        print("No `td` element with class `content` found.")
        return []

def get_cookie_expiration():
    current_datetime = datetime.now()
    new_datetime = current_datetime + timedelta(hours=6)
    return new_datetime.strftime("%a, %d-%b-%Y %H:%M:%S GMT")

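# Note: requests sends every key in a `cookies` dict as its own cookie, so the
# 'expires' and 'path' entries below go over the wire as cookies literally named
# "expires" and "path", not as attributes of `nsfw_inter`. The server appears to
# ignore them; `nsfw_inter` is the cookie that skips the content-warning page.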
def fetch_webpage_with_cookie(url):
    expiration = get_cookie_expiration()
    cookies = {
        'nsfw_inter': '1',
        'expires': expiration,
        'path': '/'
    }
    with requests.Session() as session:
        response = session.get(url, cookies=cookies)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            return None

def parse_imagebam_page(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    image_tag = soup.find('img', class_='main-image')
    if image_tag:
        image_url = image_tag.get('src')
        return image_url
    else:
        print("No image found on the page.")
        return None

def download_image(url, save_dir):
    try:
        os.makedirs(save_dir, exist_ok=True)
        filename = url.split('/')[-1]
        save_path = os.path.join(save_dir, filename)

        # Skip the request entirely if the file is already on disk.
        if os.path.exists(save_path):
            print(f"{save_path} already exists.")
            return

        response = requests.get(url)
        response.raise_for_status()

        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image successfully downloaded and saved to {save_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")

def download_gallery_images(gallery_url, save_root):
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if html_content:
        urls = parse_imagehost_urls_from_gallery(html_content)
        gallery_name = gallery_url.split('/')[-1]
        # Each gallery gets its own subfolder under save_root.
        save_dir = os.path.join(save_root, gallery_name)

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(fetch_and_download_image, url, save_dir) for url in urls]
            for future in as_completed(futures):
                future.result()

def fetch_and_download_image(url, save_dir):
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        if not imagebam_content:
            return
        image_src = parse_imagebam_page(imagebam_content)
        if image_src:
            download_image(image_src, save_dir)
    except Exception as e:
        print(f"Failed to download the image: {e}")

# Main function
def main():
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'

    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')

        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            continue
        gallery_urls = parse_galleries_from_root(page_html_content)

        for gallery_url in gallery_urls:
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()
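# To grab a single gallery without crawling all 500 index pages, call
# download_gallery_images() directly (the gallery URL below is a placeholder):
#
#     download_gallery_images('https://bitch-show.com/gallery/<gallery-slug>', './')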