Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Works with Selenium 4.7.2 and Google as of Jan. 11, 2023. #33

Open
wants to merge 2 commits into
base: patch-1
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 40 additions & 9 deletions google_images_download/google_images_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,10 @@ def download_page(self, url):
def download_extended_page(self, url, chromedriver, browser):
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
if selenium.__version__ >= '4.3':
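# The find_element_by_* helpers were removed in Selenium 4.3; use find_element(By.<STRATEGY>, ...) instead.
# https://www.selenium.dev/documentation/webdriver/getting_started/upgrade_to_selenium_4/
# NOTE(review): the version check above compares strings, so '4.10.0' >= '4.3' is False; compare parsed version numbers instead.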
from selenium.webdriver.common.by import By
if sys.version_info[0] < 3:
reload(sys)
sys.setdefaultencoding('utf8')
Expand Down Expand Up @@ -307,21 +311,31 @@ def download_extended_page(self, url, chromedriver, browser):

# Bypass "Before you continue" if it appears
try:
browser.find_element_by_css_selector("[aria-label='Accept all']").click()
if selenium.__version__ >= '4.3':
browser.find_element(By.CSS_SELECTOR, "[aria-label='Accept all']" ).click()
else:
browser.find_element_by_css_selector("[aria-label='Accept all']").click()
time.sleep(1)
except selenium.common.exceptions.NoSuchElementException:
pass

print("Getting you a lot of images. This may take a few moments...")

element = browser.find_element_by_tag_name("body")
if selenium.__version__ >= '4.3':
element = browser.find_element(By.TAG_NAME, "body")
else:
element = browser.find_element_by_tag_name("body")

# Scroll down
for i in range(50):
element.send_keys(Keys.PAGE_DOWN)
time.sleep(0.3)

try:
browser.find_element_by_xpath('//input[@value="Show more results"]').click()
if selenium.__version__ >= '4.3':
browser.find_element(By.XPATH, '//input[@value="Show more results"]').click()
else:
browser.find_element_by_xpath('//input[@value="Show more results"]').click()
for i in range(50):
element.send_keys(Keys.PAGE_DOWN)
time.sleep(0.3) # bot id protection
Expand Down Expand Up @@ -402,22 +416,39 @@ def get_all_tabs(self, page):
def format_object(self, object):
    """Flatten one raw Google Images result entry into a metadata dict.

    ``object[1]`` is the entry's data list. ``data[3]`` supplies the image
    link, width and height, and ``data[2][0]`` supplies the thumbnail URL.
    The description, host and source are read from the ``'2003'`` entry of
    the last dict found in ``data``.

    Args:
        object: Indexable result entry. The name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        A dict with the keys ``image_height``, ``image_width``,
        ``image_link``, ``image_format``, ``image_thumbnail_url``,
        ``image_description``, ``image_host`` and ``image_source``.
        The last three are ``'NA'`` when ``data`` contains no dict.
        ``None`` is returned (after printing the error) when any of the
        expected fields cannot be read.

    Raises:
        IndexError, TypeError: If ``object[1]`` or ``data[3]`` cannot be
            indexed; these lookups are intentionally left outside the
            ``try`` blocks, as in the original.
    """
    data = object[1]
    main = data[3]
    formatted_object = {}
    try:
        link = main[0]
        formatted_object['image_height'] = main[2]
        formatted_object['image_width'] = main[1]
        formatted_object['image_link'] = link
        # Text after the last "."; the whole link when it has no "." and an
        # empty string when the link ends with ".".
        formatted_object['image_format'] = link[link.rfind(".") + 1:]
        formatted_object['image_thumbnail_url'] = data[2][0]
    except Exception as e:
        # Deliberately broad: the payload layout is undocumented, and any
        # malformed entry is reported and skipped rather than crashing.
        print(e)
        return None

    # Google sometimes changes the position of the info dict inside data.
    # Search from the end, because as of Jan 19, 2023 it sits at data[23].
    info = next(
        (item for item in reversed(data) if isinstance(item, dict)),
        None,
    )

    if info is None:
        # Missing description/host/source is not fatal; fall back to 'NA'.
        print('Failed to find image_description, host, and source.')
        formatted_object['image_description'] = 'NA'
        formatted_object['image_host'] = 'NA'
        formatted_object['image_source'] = 'NA'
        return formatted_object

    try:
        details = info['2003']
        formatted_object['image_description'] = details[3]
        formatted_object['image_host'] = details[17]
        formatted_object['image_source'] = details[2]
    except Exception as e:
        print(e)
        return None

    return formatted_object

# function to download single image
Expand Down