Revision of rip-imagebam.py

1

+

# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/

2

+

import requests

3

+

from bs4 import BeautifulSoup

4

+

from datetime import datetime, timedelta

5

+

import os

6

+

from concurrent.futures import ThreadPoolExecutor, as_completed

7

+

8

+

9

+

def fetch_webpage(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to retrieve.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None``. A message is
        printed for every failure.
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Report network errors through the same "None" channel as bad statuses.
        print(f"Failed to retrieve the webpage: {e}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None

16

+

17

+

18

+

def parse_galleries_from_root(html_content):
    """Collect gallery links from a listing page.

    Args:
        html_content: HTML of a listing page, or ``None``/empty when the
            fetch failed.

    Returns:
        A list of ``href`` values found inside the ``flex-container`` div
        that contain the gallery URL prefix. Always a list, so callers can
        iterate the result without a ``None`` check.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    if main_content is None:
        return []

    hrefs = (link.get('href') for link in main_content.find_all('a'))
    return [
        href for href in hrefs
        if href and 'https://bitch-show.com/gallery/' in href
    ]

31

+

32

+

33

+

def parse_imagehost_urls_from_gallery(html_content):
    """Extract imagebam viewer URLs from a gallery page.

    Args:
        html_content: HTML of a gallery page, or ``None``/empty when the
            fetch failed.

    Returns:
        A list of URLs starting at ``https://www.imagebam.com/view/``.
        Returns an empty list (and prints a notice) when the expected
        ``td.content`` element cannot be located.
    """
    marker = 'https://www.imagebam.com/view/'

    td_content = None
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        container = soup.find('div', class_='flex-container')
        # Look up the cell only when the outer div exists, so a changed page
        # layout is reported instead of raising AttributeError.
        if container is not None:
            td_content = container.find('td', class_='content')

    if td_content is None:
        print("No `td` element with class `content` found.")
        return []

    urls = []
    for link in td_content.find_all('a'):
        href = link.get('href')
        if href and marker in href:
            # Keep only the part from the marker onward, dropping any prefix
            # the link carries in front of the imagebam address.
            urls.append(href[href.find(marker):])
    return urls

52

+

53

+

54

+

def get_cookie_expiration():
    """Return a cookie-style expiry timestamp six hours from now.

    Returns:
        A string such as ``"Mon, 01-Jan-2024 12:00:00 GMT"``. The time is
        taken in UTC so that it matches the literal ``GMT`` suffix.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    expires_at = datetime.now(timezone.utc) + timedelta(hours=6)
    return expires_at.strftime("%a, %d-%b-%Y %H:%M:%S GMT")

58

+

59

+

60

+

def fetch_webpage_with_cookie(url, timeout=30):
    """Fetch *url* while sending the ``nsfw_inter`` cookie.

    Args:
        url: Address of the page to retrieve.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None`` (a message is
        printed).

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # NOTE(review): 'expires' and 'path' are sent as ordinary cookie
    # name/value pairs; they look like Set-Cookie attributes. Confirm the
    # server actually needs them before removing.
    cookies = {
        'nsfw_inter': '1',
        'expires': get_cookie_expiration(),
        'path': '/',
    }

    with requests.Session() as session:
        # Without a timeout a stalled server would block this worker forever.
        response = session.get(url, cookies=cookies, timeout=timeout)

    if response.status_code == 200:
        return response.text

    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None

75

+

76

+

77

+

def parse_imagebam_page(html_content):
    """Find the full-size image address on an imagebam viewer page.

    Args:
        html_content: HTML of the viewer page, or ``None``/empty when the
            fetch failed.

    Returns:
        The ``src`` attribute of the ``img.main-image`` tag, or ``None``
        (with a printed notice) when no such tag is present.
    """
    image_tag = None
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        image_tag = soup.find('img', class_='main-image')

    if image_tag is None:
        print("No image found on the page.")
        return None

    return image_tag.get('src')

86

+

87

+

88

+

def download_image(url, save_dir, timeout=60):
    """Download the image at *url* into *save_dir*.

    The file name is the last path segment of *url*. Nothing is fetched
    when a file with that name already exists.

    Args:
        url: Direct address of the image.
        save_dir: Destination directory; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Progress and request failures are reported via ``print``.
    """
    os.makedirs(save_dir, exist_ok=True)
    filename = url.split('/')[-1]
    save_path = os.path.join(save_dir, filename)

    # Check before requesting, so re-runs do not re-download existing files.
    if os.path.exists(save_path):
        print(f"{save_path} already exists.")
        return

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")
        return

    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Image successfully downloaded and saved to {save_path}")

105

+

106

+

107

+

def download_gallery_images(gallery_url, save_root):
    """Download every imagebam image linked from one gallery page.

    Args:
        gallery_url: Address of the gallery page.
        save_root: Directory under which a per-gallery folder is used.

    Returns:
        None. Does nothing further when the gallery page cannot be fetched.
    """
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if not html_content:
        return

    urls = parse_imagehost_urls_from_gallery(html_content)

    # Strip a trailing slash so the gallery name is never empty, and join
    # paths properly so save_root works with or without a trailing separator.
    gallery_name = gallery_url.rstrip('/').split('/')[-1]
    save_dir = os.path.join(save_root, gallery_name)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(fetch_and_download_image, url, save_dir)
            for url in urls
        ]
        for future in as_completed(futures):
            # result() re-raises anything the worker did not handle itself.
            future.result()

122

+

123

+

124

+

def fetch_and_download_image(url, save_dir):
    """Resolve one imagebam viewer page to its image and download it.

    Args:
        url: Address of the imagebam viewer page.
        save_dir: Directory the image is saved into.

    Returns:
        None. Any exception is caught and printed so that one bad image
        does not stop the rest of the gallery.
    """
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        if not imagebam_content:
            # The fetch helper has already printed the failing status code;
            # skip parsing instead of handing None to the HTML parser.
            return

        image_src = parse_imagebam_page(imagebam_content)
        if image_src:
            download_image(image_src, save_dir)
    except Exception as e:
        print(f"Failed to download the image: {e}")

132

+

133

+

134

+

# Main function

135

+

def main():
    """Walk listing pages 1-500 and download every gallery found on them."""
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'

    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')

        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            # A failed listing page should not abort the whole crawl.
            continue

        # "or []" tolerates a parser that returns None for an unexpected layout.
        gallery_urls = parse_galleries_from_root(page_html_content) or []
        for gallery_url in gallery_urls:
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()

Malin / rip-imagebam.py

Malin revised this gist 2 years ago. Go to revision

		@@ -0,0 +1,151 @@
1	+	# from: https://www.reddit.com/r/DataHoarder/comments/1d90f3c/nsfw_best_way_to_download_hundreds_of_separate/
2	+	import requests
3	+	from bs4 import BeautifulSoup
4	+	from datetime import datetime, timedelta
5	+	import os
6	+	from concurrent.futures import ThreadPoolExecutor, as_completed
7	+
8	+
def fetch_webpage(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to retrieve.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None``. A message is
        printed for every failure.
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Report network errors through the same "None" channel as bad statuses.
        print(f"Failed to retrieve the webpage: {e}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None
16	+
17	+
def parse_galleries_from_root(html_content):
    """Collect gallery links from a listing page.

    Args:
        html_content: HTML of a listing page, or ``None``/empty when the
            fetch failed.

    Returns:
        A list of ``href`` values found inside the ``flex-container`` div
        that contain the gallery URL prefix. Always a list, so callers can
        iterate the result without a ``None`` check.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    main_content = soup.find('div', class_='flex-container')
    if main_content is None:
        return []

    hrefs = (link.get('href') for link in main_content.find_all('a'))
    return [
        href for href in hrefs
        if href and 'https://bitch-show.com/gallery/' in href
    ]
31	+
32	+
def parse_imagehost_urls_from_gallery(html_content):
    """Extract imagebam viewer URLs from a gallery page.

    Args:
        html_content: HTML of a gallery page, or ``None``/empty when the
            fetch failed.

    Returns:
        A list of URLs starting at ``https://www.imagebam.com/view/``.
        Returns an empty list (and prints a notice) when the expected
        ``td.content`` element cannot be located.
    """
    marker = 'https://www.imagebam.com/view/'

    td_content = None
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        container = soup.find('div', class_='flex-container')
        # Look up the cell only when the outer div exists, so a changed page
        # layout is reported instead of raising AttributeError.
        if container is not None:
            td_content = container.find('td', class_='content')

    if td_content is None:
        print("No `td` element with class `content` found.")
        return []

    urls = []
    for link in td_content.find_all('a'):
        href = link.get('href')
        if href and marker in href:
            # Keep only the part from the marker onward, dropping any prefix
            # the link carries in front of the imagebam address.
            urls.append(href[href.find(marker):])
    return urls
52	+
53	+
def get_cookie_expiration():
    """Return a cookie-style expiry timestamp six hours from now.

    Returns:
        A string such as ``"Mon, 01-Jan-2024 12:00:00 GMT"``. The time is
        taken in UTC so that it matches the literal ``GMT`` suffix.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    expires_at = datetime.now(timezone.utc) + timedelta(hours=6)
    return expires_at.strftime("%a, %d-%b-%Y %H:%M:%S GMT")
58	+
59	+
def fetch_webpage_with_cookie(url, timeout=30):
    """Fetch *url* while sending the ``nsfw_inter`` cookie.

    Args:
        url: Address of the page to retrieve.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None`` (a message is
        printed).

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # NOTE(review): 'expires' and 'path' are sent as ordinary cookie
    # name/value pairs; they look like Set-Cookie attributes. Confirm the
    # server actually needs them before removing.
    cookies = {
        'nsfw_inter': '1',
        'expires': get_cookie_expiration(),
        'path': '/',
    }

    with requests.Session() as session:
        # Without a timeout a stalled server would block this worker forever.
        response = session.get(url, cookies=cookies, timeout=timeout)

    if response.status_code == 200:
        return response.text

    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None
75	+
76	+
def parse_imagebam_page(html_content):
    """Find the full-size image address on an imagebam viewer page.

    Args:
        html_content: HTML of the viewer page, or ``None``/empty when the
            fetch failed.

    Returns:
        The ``src`` attribute of the ``img.main-image`` tag, or ``None``
        (with a printed notice) when no such tag is present.
    """
    image_tag = None
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        image_tag = soup.find('img', class_='main-image')

    if image_tag is None:
        print("No image found on the page.")
        return None

    return image_tag.get('src')
86	+
87	+
def download_image(url, save_dir, timeout=60):
    """Download the image at *url* into *save_dir*.

    The file name is the last path segment of *url*. Nothing is fetched
    when a file with that name already exists.

    Args:
        url: Direct address of the image.
        save_dir: Destination directory; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Progress and request failures are reported via ``print``.
    """
    os.makedirs(save_dir, exist_ok=True)
    filename = url.split('/')[-1]
    save_path = os.path.join(save_dir, filename)

    # Check before requesting, so re-runs do not re-download existing files.
    if os.path.exists(save_path):
        print(f"{save_path} already exists.")
        return

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the image: {e}")
        return

    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Image successfully downloaded and saved to {save_path}")
105	+
106	+
def download_gallery_images(gallery_url, save_root):
    """Download every imagebam image linked from one gallery page.

    Args:
        gallery_url: Address of the gallery page.
        save_root: Directory under which a per-gallery folder is used.

    Returns:
        None. Does nothing further when the gallery page cannot be fetched.
    """
    print(f'Trying to download gallery at {gallery_url}')
    html_content = fetch_webpage(gallery_url)
    if not html_content:
        return

    urls = parse_imagehost_urls_from_gallery(html_content)

    # Strip a trailing slash so the gallery name is never empty, and join
    # paths properly so save_root works with or without a trailing separator.
    gallery_name = gallery_url.rstrip('/').split('/')[-1]
    save_dir = os.path.join(save_root, gallery_name)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(fetch_and_download_image, url, save_dir)
            for url in urls
        ]
        for future in as_completed(futures):
            # result() re-raises anything the worker did not handle itself.
            future.result()
122	+
123	+
def fetch_and_download_image(url, save_dir):
    """Resolve one imagebam viewer page to its image and download it.

    Args:
        url: Address of the imagebam viewer page.
        save_dir: Directory the image is saved into.

    Returns:
        None. Any exception is caught and printed so that one bad image
        does not stop the rest of the gallery.
    """
    try:
        imagebam_content = fetch_webpage_with_cookie(url)
        if not imagebam_content:
            # The fetch helper has already printed the failing status code;
            # skip parsing instead of handing None to the HTML parser.
            return

        image_src = parse_imagebam_page(imagebam_content)
        if image_src:
            download_image(image_src, save_dir)
    except Exception as e:
        print(f"Failed to download the image: {e}")
132	+
133	+
134	+	# Main function
def main():
    """Walk listing pages 1-500 and download every gallery found on them."""
    save_root = './'
    scrape_root = 'https://bitch-show.com/page/'

    for i in range(1, 501):
        page_url = f'{scrape_root}{i}'
        print(f'Working on Page {i} at {page_url}')

        page_html_content = fetch_webpage(page_url)
        if not page_html_content:
            # A failed listing page should not abort the whole crawl.
            continue

        # "or []" tolerates a parser that returns None for an unexpected layout.
        gallery_urls = parse_galleries_from_root(page_html_content) or []
        for gallery_url in gallery_urls:
            download_gallery_images(gallery_url, save_root)


if __name__ == "__main__":
    main()