# Simple Sample Amazon Scraper by Cash Barnes
#
# Requirements:
#   Selenium
#   Chrome (make sure you are using the correct version of chromedriver for your browser)
#   BeautifulSoup4
#   sample_input.csv with a list of search terms. Each search term should be
#   the only thing on its row.
#
# For every search term in the input file the script types the term into the
# site's search box, takes the first search result on the resulting page and
# appends one row (search term, item title, rating, item url) to the output csv.

import csv  # read from and write to csv files
import time  # time.sleep() gives a new page a moment to load

from bs4 import BeautifulSoup
from selenium import webdriver  # make sure you are using the correct webdriver for your version of Chrome
from selenium.webdriver.common.by import By  # locator strategies for driver.find_element
from selenium.webdriver.common.keys import Keys  # lets us send key strokes to the pages our scraper is visiting

sample_input = 'sample_input.csv'  # change this to the name of the input csv you want your scraper to use
sample_output = 'sample_output.csv'
search_url = 'http://amazon.com'  # change this to the url that you want your scraper to visit


def parse_top_result(page_source):
    """Extract the first search result from a results page.

    Args:
        page_source: HTML of the search results page as a string.

    Returns:
        A ``(description, rating, url)`` tuple of strings. Any piece that is
        not present in the markup (no results at all, a result without a
        title link, a result without a rating) comes back as an empty string
        instead of raising, so one odd page cannot abort the whole run.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    # The inspect tool in Chrome shows that every search result is a div
    # carrying this data attribute; we only need the first one.
    item = soup.find('div', {'data-component-type': 's-search-result'})
    if item is None:
        return '', '', ''

    description = ''
    url = ''
    atag = item.h2.a if item.h2 is not None else None
    if atag is not None:
        description = atag.text.strip()
        href = atag.get('href')
        if href:
            url = 'https://www.amazon.com' + href

    # Price lookup, left disabled because not every item has a price:
    #   price_parent = item.find('span', 'a-price')            # parent span of the price
    #   price = price_parent.find('span', 'a-offscreen').text  # price span inside the parent
    # Review count lookup, left disabled because it does not consistently
    # point at the review count:
    #   review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text

    # The first <i> tag inside the result is used as the rating text; it is
    # missing for items that have not been rated yet.
    rating = item.i.text if item.i is not None else ''
    return description, rating, url


# Open our output file, clear it with 'w' and write the header row.
with open(sample_output, 'w', newline='') as header_file:
    csv.writer(header_file).writerow(['Search Term', 'Top Item', 'Rating', 'URL'])

# Open our input file and begin scraping.
with open(sample_input, newline='') as input_file:
    search_term_reader = csv.reader(input_file, delimiter=",")
    # NOTE(review): newer Selenium releases may expect a Service object here
    # instead of a path string - adjust this line if it fails on your install.
    driver = webdriver.Chrome('chromedriver.exe')
    try:
        driver.get(search_url)
        for row in search_term_reader:
            # csv.reader yields an empty list for blank lines; skip those and
            # rows whose first column is empty rather than crashing on row[0].
            if not row or not row[0].strip():
                continue
            search_term = row[0]  # first column of the input file (there should only be one column)

            # We use find_element to locate the search box. You could also
            # just use the search url paired with the search terms.
            search_box = driver.find_element(By.ID, 'twotabsearchtextbox')
            search_box.clear()  # remove the previous search term
            search_box.send_keys(search_term)  # type the search term into the search box
            search_box.send_keys(Keys.RETURN)  # hit the enter key to run the search
            time.sleep(.5)  # brief pause for the page to load after hitting enter

            description, rating, url = parse_top_result(driver.page_source)

            # Append row by row so the results gathered so far are already on
            # disk if a later search fails.
            with open(sample_output, 'a', newline='') as output_file:
                csv.writer(output_file).writerow([search_term, description, rating, url])
    finally:
        # Always close the browser, even when a search raised an exception.
        driver.quit()