# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup


def fetch_webpage(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None


def parse_galleries_from_root(html_content):
    # Collect all gallery links from a listing page.
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    urls = []
    if main_content:
        for link in main_content.find_all('a'):
            href = link.get('href')
            if href and 'https://bitch-show.com/gallery/' in href:
                urls.append(href)
    return urls


def parse_imagehost_urls_from_gallery(html_content):
    # Collect the imagebam viewer links from a single gallery page.
    soup = BeautifulSoup(html_content, 'html.parser')
    flex_container = soup.find('div', class_='flex-container')
    td_content = flex_container.find('td', class_='content') if flex_container else None
    if td_content:
        urls = []
        for link in td_content.find_all('a'):
            href = link.get('href')
            if href and 'https://www.imagebam.com/view/' in href:
                # Strip anything preceding the imagebam URL (e.g. a redirect wrapper).
                start_index = href.find('https://www.imagebam.com/view/')
                if start_index != -1:
                    urls.append(href[start_index:])
        return urls
    else:
        print("No `td` element with class `content` found.")
        return []


def get_cookie_expiration():
    # Expiration timestamp six hours from now, formatted as a cookie date string.
    current_datetime = datetime.now()
    new_datetime = current_datetime + timedelta(hours=6)
    return new_datetime.strftime("%a, %d-%b-%Y %H:%M:%S GMT")


def fetch_webpage_with_cookie(url):
    # The `nsfw_inter` cookie marks the NSFW interstitial as already accepted.
    expiration = get_cookie_expiration()
    cookies = {
        'nsfw_inter': '1',
        'expires': expiration,
        'path': '/'
    }
    with requests.Session() as session:
        response = session.get(url, cookies=cookies)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            return None


def parse_imagebam_page(html_content):
    # Pull the full-size image URL out of an imagebam viewer page.
    soup = BeautifulSoup(html_content, 'html.parser')
    image_tag = soup.find('img', class_='main-image')
    if image_tag:
        return image_tag.get('src')
    else:
        print("No image found on the page.")
        return None


def download_image(url, save_dir):
    try:
        os.makedirs(save_dir, exist_ok=True)
        filename = url.split('/')[-1]
        save_path = os.path.join(save_dir, filename)
        # Skip files that were already downloaded on a previous run.
        if os.path.exists(save_path):
            print(f"{save_path} already exists.")
            return
        response = requests.get(url)
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image successfully downloaded and saved to {save_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")


def download_gallery_images(gallery_url, save_root):
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if html_content:
        urls = parse_imagehost_urls_from_gallery(html_content)
        gallery_name = gallery_url.split('/')[-1]
        save_dir = os.path.join(save_root, gallery_name)
        # Fetch and download the gallery's images in parallel.
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(fetch_and_download_image, url, save_dir) for url in urls]
            for future in as_completed(futures):
                future.result()


def fetch_and_download_image(url, save_dir):
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        if imagebam_content:
            image_src = parse_imagebam_page(imagebam_content)
            if image_src:
                download_image(image_src, save_dir)
    except Exception as e:
        print(f"Failed to download the image: {e}")


# Main function
def main():
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'
    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')
        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            continue
        gallery_urls = parse_galleries_from_root(page_html_content)
        for gallery_url in gallery_urls:
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()