The Python class below is shared by both the web scraper and the web crawler.
import os
import uuid

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
class SeleniumCrawler:
    """Headless-browser wrapper used by both the web scraper and the crawler.

    Owns a single PhantomJS driver instance and provides helpers to load a
    page, inject the scraping JS toolkit, and capture screenshots.
    """

    def __init__(self):
        # NOTE(review): PhantomJS support was deprecated and removed in modern
        # Selenium releases — consider migrating to headless Chrome/Firefox.
        # The original source had this continuation mangled into a literal
        # '\n' inside the string; restored as a proper line continuation.
        DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = \
            'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0'
        self.browser = webdriver.PhantomJS(
            executable_path='/usr/local/lib/node_modules/phantomjs/lib/phantom/bin/phantomjs')
        # Full-HD viewport so pages render with a desktop layout.
        self.browser.set_window_size(1920, 1080)

    def navigate(self, url):
        """Load ``url``, inject the helper scripts, and return the driver."""
        self.browser.get(url)
        self.setup_scripts()
        return self.browser

    def setup_scripts(self):
        """Inject the JS libraries and page-prep script the scraper relies on.

        Paths are relative to the process working directory — TODO confirm
        that callers always run from the project root.
        """
        self.run_multiple_js(['lodash.min.js', 'jquery-2.1.4.min.js', 'whitebg.js'])

    def run_js(self, js_path):
        """Execute the JS file at ``js_path`` in the current page.

        Returns whatever the injected code exposes on
        ``window.WebScrapeNS.data`` (``None`` if the script sets nothing).
        """
        with open(js_path, 'r') as js_file:
            js_text = js_file.read()
        return self.browser.execute_script(
            js_text + '; return window.WebScrapeNS.data;')

    def run_multiple_js(self, js_paths):
        """Run each script in ``js_paths`` in order; return the list of results."""
        return [self.run_js(js_path) for js_path in js_paths]

    def screenshot(self):
        """Save a uniquely named PNG under ``screenshots/`` and return its path.

        Bug fixes vs. original: the directory is created if missing (PhantomJS
        silently fails otherwise) and the saved path is returned instead of
        ``None`` so callers can locate the file.
        """
        os.makedirs('screenshots', exist_ok=True)
        file_path = os.path.join('screenshots', str(uuid.uuid4()) + '.png')
        self.browser.save_screenshot(file_path)
        return file_path