rip-imagebam.py
· 4.6 KiB · Python
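A scraper (shared in the Reddit thread linked in the first comment) that walks the paginated gallery index of bitch-show.com, follows each gallery's imagebam.com viewer links, and downloads the full-size images into one folder per gallery.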
# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_webpage(url):
    """Fetch a page and return its HTML, or None on a non-200 response."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None


def parse_galleries_from_root(html_content):
    """Collect gallery URLs from an index page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    if not main_content:
        return []

    urls = []
    for link in main_content.find_all('a'):
        href = link.get('href')
        if href and 'https://bitch-show.com/gallery/' in href:
            urls.append(href)
    return urls


def parse_imagehost_urls_from_gallery(html_content):
    """Collect imagebam viewer URLs from a gallery page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    td_content = main_content.find('td', class_='content') if main_content else None
    if not td_content:
        print("No `td` element with class `content` found.")
        return []

    urls = []
    for link in td_content.find_all('a'):
        href = link.get('href')
        if href and 'https://www.imagebam.com/view/' in href:
            # Some hrefs wrap the imagebam URL; keep only the part starting at it.
            start_index = href.find('https://www.imagebam.com/view/')
            urls.append(href[start_index:])
    return urls


def get_cookie_expiration():
    """Return a timestamp six hours from now in cookie date format."""
    new_datetime = datetime.now() + timedelta(hours=6)
    return new_datetime.strftime("%a, %d-%b-%Y %H:%M:%S GMT")


def fetch_webpage_with_cookie(url):
    """Fetch an imagebam page with the NSFW interstitial cookie set."""
    # Note: requests sends every key here as a cookie, so only 'nsfw_inter'
    # matters; 'expires' and 'path' are kept from the original script but are
    # transmitted as ordinary cookies, not cookie attributes.
    cookies = {
        'nsfw_inter': '1',
        'expires': get_cookie_expiration(),
        'path': '/'
    }
    with requests.Session() as session:
        response = session.get(url, cookies=cookies)
        if response.status_code == 200:
            return response.text
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None


def parse_imagebam_page(html_content):
    """Return the full-size image URL from an imagebam viewer page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    image_tag = soup.find('img', class_='main-image')
    if image_tag:
        return image_tag.get('src')
    print("No image found on the page.")
    return None


def download_image(url, save_dir):
    """Download one image into save_dir, skipping files that already exist."""
    try:
        os.makedirs(save_dir, exist_ok=True)
        filename = url.split('/')[-1]
        save_path = os.path.join(save_dir, filename)

        if os.path.exists(save_path):
            print(f"{save_path} already exists.")
            return

        response = requests.get(url)
        response.raise_for_status()

        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image successfully downloaded and saved to {save_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")


def download_gallery_images(gallery_url, save_root):
    """Download every image in a gallery, resolving imagebam pages concurrently."""
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if not html_content:
        return

    urls = parse_imagehost_urls_from_gallery(html_content)
    gallery_name = gallery_url.split('/')[-1]
    save_dir = os.path.join(save_root, gallery_name)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_and_download_image, url, save_dir) for url in urls]
        for future in as_completed(futures):
            future.result()


def fetch_and_download_image(url, save_dir):
    """Resolve an imagebam viewer URL to its image and download it."""
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        if not imagebam_content:
            return
        image_src = parse_imagebam_page(imagebam_content)
        if image_src:
            download_image(image_src, save_dir)
    except Exception as e:
        print(f"Failed to download the image: {e}")


# Main function
def main():
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'

    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')

        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            continue

        for gallery_url in parse_galleries_from_root(page_html_content):
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()
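To run it, install the two third-party dependencies it imports (pip install requests beautifulsoup4), then run python rip-imagebam.py from the directory where you want the output; galleries are saved as subfolders of the current directory, and pages 1 through 500 of the index are scraped by default.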