Basic Web Crawler using Python, Selenium, and PhantomJS

Below is code that finds the visible links on a web page and then follows them. The code is still very rough at this point; it is meant as a simple demonstration of driving PhantomJS through Selenium, not as a production crawler.

Note: This article is still being written and is currently incomplete.
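
The script below relies on two helper modules that aren't shown in full here: sqladb, which defines the Link table, and selenium_crawler, which wraps the PhantomJS webdriver (a sketch of it follows the JavaScript at the end). A minimal sqladb that matches how the crawler uses it might look like the following; the table name and id column are assumptions, but url and status are referenced directly by the crawler, and the unique constraint on url is what raises the IntegrityError the crawler catches for duplicate links.

from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Link(Base):
    __tablename__ = 'links'  # table name is an assumption

    id = Column(Integer, primary_key=True)
    url = Column(Text, nullable=False, unique=True)      # unique: duplicates raise IntegrityError
    status = Column(Integer, nullable=False, default=0)  # 0 = ready to crawl, 1 = crawled

if __name__ == '__main__':
    # create the table (run once before starting the crawler)
    engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
    Base.metadata.create_all(engine)

With that in place, the crawler itself: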

from selenium_crawler import SeleniumCrawler
from sqladb import Link
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError
import time
import urllib.parse
import urllib.robotparser

try:
    # connect to the DB and start an SQLAlchemy session
    engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
    engine.connect()
    Session = sessionmaker(bind=engine)
    session = Session()

    while True:
        # Get the links from the DB that are ready to be crawled
        for link in session.query(Link).filter(Link.status == 0):
            crawler = SeleniumCrawler()

            try:
                browser = crawler.navigate(link.url)
            except Exception as ex:
                print(str(ex))
                continue  # go get the next link from the database

            crawler.screenshot()

            # Get the visible URLs via jQuery (get_urls.js, shown below, returns them in WebScrapeNS.data)
            response = crawler.run_js('get_urls.js')
            urls = response['urls']

            for url in urls:
                url = url.strip()
                if url == '' or '#' in url:
                    continue  # skip empty hrefs and any URL containing an on-page fragment (#)

                parsed_url = urllib.parse.urlparse(url)

                if parsed_url.hostname is None:
                    url = urllib.parse.urljoin(link.url, url)

                p = urllib.parse.urlparse(url)

                if p.scheme not in ('http', 'https'):  # we're only going to crawl http or https
                    continue

                # Find the robots.txt so we know what to avoid
                robots_url = p.scheme + '://' + p.hostname

                if p.port:
                    robots_url += p.port

                robots_url += '/robots.txt'

                try:
                    rp = urllib.robotparser.RobotFileParser()
                    rp.set_url(robots_url)
                    rp.read()

                    if not rp.can_fetch("*", url):
                        continue
                except Exception:
                    pass  # couldn't fetch robots.txt; assume crawling is allowed

                # add the link we found to the DB so we can crawl it
                add_link = Link()
                add_link.url = url
                session.add(add_link)

                try:
                    session.commit()
                except IntegrityError as ex:
                    session.rollback()  # URL already exists in the database

            # update the status of the link we just successfully crawled
            link.status = 1
            session.commit()

            # quit this link's browser so PhantomJS processes don't accumulate
            browser.quit()

            time.sleep(1)
        time.sleep(1)
except Exception as ex:
    print(str(ex))
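
The loop only processes rows with status 0, so the links table needs at least one seed row before the crawler will do anything. A quick way to add one (the seed URL here is just an example):

from sqladb import Link
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
session = sessionmaker(bind=engine)()
session.add(Link(url='https://example.com/'))  # example seed; status defaults to 0 (ready to crawl)
session.commit()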

JavaScript to Get URLs for Visible Links

(function($, _) {
    var urls = [];

    $('a:visible').each(function() { // Note: visibility: hidden and opacity: 0 are considered 'visible'
        var url = $(this).attr('href');

        if(url) {
            urls.push(url);
        }
    });
    window.WebScrapeNS.data = {urls: urls};

})(window.WebScrapeNS.$, window.WebScrapeNS._);
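
The selenium_crawler module isn't shown here either. Based on how the crawler and the script above use it, run_js evidently injects jQuery and Underscore into a WebScrapeNS namespace on the page, executes the given script file, and returns whatever the script stored in window.WebScrapeNS.data. A minimal sketch under those assumptions (the local jquery.min.js and underscore.min.js filenames and the screenshot naming scheme are also assumptions):

import time
from selenium import webdriver

class SeleniumCrawler:
    def __init__(self):
        # PhantomJS support was deprecated in Selenium 3.8, but it's what this article targets
        self.browser = webdriver.PhantomJS()

    def navigate(self, url):
        # load the page and hand back the underlying webdriver
        self.browser.get(url)
        return self.browser

    def screenshot(self):
        # save a screenshot of the current page; the naming scheme is an assumption
        self.browser.save_screenshot('screenshot-%d.png' % int(time.time()))

    def run_js(self, filename):
        # set up WebScrapeNS with jQuery and Underscore, run the script file,
        # then read back whatever it stored in WebScrapeNS.data
        with open('jquery.min.js') as f:
            setup = f.read()
        with open('underscore.min.js') as f:
            setup += ';\n' + f.read()
        setup += ';\nwindow.WebScrapeNS = {$: jQuery.noConflict(true), _: _.noConflict()};'
        self.browser.execute_script(setup)
        with open(filename) as f:
            self.browser.execute_script(f.read())
        return self.browser.execute_script('return window.WebScrapeNS.data;')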

Code under MIT License unless otherwise indicated.
© 2020, Downranked, LLC.