Below is code that finds the visible links on a web page and then follows them. The code is still very rough at this point and is only meant to be a simple demonstration of PhantomJS functionality.
Note: This article is still being written and is currently incomplete.
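The crawl loop below reads and writes Link rows through SQLAlchemy, but the sqladb module isn't listed in this article yet. Purely as an assumption about its shape, a minimal Link model only needs a unique url column (so re-discovered URLs raise an IntegrityError) and an integer status column defaulting to 0 for "not yet crawled":

# sqladb.py - hypothetical sketch of the Link model used below
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Link(Base):
    __tablename__ = 'links'
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)    # unique, so duplicate URLs raise IntegrityError
    status = Column(Integer, default=0)  # 0 = not yet crawled, 1 = crawled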
from selenium_crawler import SeleniumCrawler
from selenium.webdriver.common.keys import Keys
import json
import time
from sqladb import Link
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import urllib.robotparser
import urllib.parse
from sqlalchemy.exc import IntegrityError
try:
    # Connect to the DB and start an SQLAlchemy session
    engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
    engine.connect()
    Session = sessionmaker(bind=engine)
    session = Session()
    while True:
        # Get the links from the DB that are ready to be crawled
        for link in session.query(Link).filter(Link.status == 0):
            crawler = SeleniumCrawler()
            try:
                browser = crawler.navigate(link.url)
            except Exception as ex:
                print(str(ex))
                continue  # go get the next link from the database
            crawler.screenshot()
            # Get the visible URLs via jQuery
            response = crawler.run_js('get_urls.js')
            urls = response['urls']
            for url in urls:
                url = url.strip()
                if url == '' or '#' in url:
                    continue  # skip empty hrefs and links with targets on the page (#)
                parsed_url = urllib.parse.urlparse(url)
                if parsed_url.hostname is None:
                    # relative URL: resolve it against the page we just crawled
                    url = urllib.parse.urljoin(link.url, url)
                p = urllib.parse.urlparse(url)
                if p.scheme not in ('http', 'https'):  # we're only going to crawl http or https
                    continue
                # Find the robots.txt so we know what to avoid
                robots_url = p.scheme + '://' + p.hostname
                if p.port:
                    robots_url += ':' + str(p.port)
                robots_url += '/robots.txt'
                try:
                    rp = urllib.robotparser.RobotFileParser()
                    rp.set_url(robots_url)
                    rp.read()
                    if not rp.can_fetch("*", url):
                        continue  # robots.txt disallows this URL
                except Exception as ex:
                    err = ex  # couldn't read robots.txt; carry on and keep the URL
                # Add the link we found to the DB so we can crawl it later
                add_link = Link()
                add_link.url = url
                session.add(add_link)
                try:
                    session.commit()
                except IntegrityError:
                    session.rollback()  # URL already exists in the database
            # Update the status of the link we just successfully crawled
            link.status = 1
            session.commit()
            browser.quit()  # close this link's browser before moving on
            time.sleep(1)
        time.sleep(1)
except Exception as ex:
    print(str(ex))
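The get_urls.js script executed above by crawler.run_js() is shown next. It collects the href of every visible anchor on the page and stashes the list on a WebScrapeNS namespace object, which the Python side then reads back. The namespaced copies of jQuery and Underscore ($ and _) are assumed to have already been injected into the page by SeleniumCrawler.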
(function($, _) {
    var urls = [];
    $('a:visible').each(function() { // Note: visibility: hidden and opacity: 0 are considered 'visible'
        var url = $(this).attr('href');
        if (url) {
            urls.push(url);
        }
    });
    window.WebScrapeNS.data = {urls: urls};
})(window.WebScrapeNS.$, window.WebScrapeNS._);
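The SeleniumCrawler class itself isn't listed in this article yet. Purely as an illustration of how the two pieces could talk to each other, here is a minimal, hypothetical sketch of a run_js method, assuming the crawler keeps its WebDriver instance in self.browser and has already injected jQuery and Underscore under window.WebScrapeNS:

# Hypothetical sketch - not the actual SeleniumCrawler implementation
def run_js(self, filename):
    # Read the JavaScript file and execute it in the current page
    with open(filename) as f:
        self.browser.execute_script(f.read())
    # The script stores its result on window.WebScrapeNS.data; read it back
    return self.browser.execute_script('return window.WebScrapeNS.data;')

Because execute_script converts a JSON-serializable JavaScript object into a Python dict, the crawl loop can index response['urls'] directly.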