Basic Web Crawler using Python, Selenium, and PhantomJS

Below is code that finds the visible links on a web page and then follows them. The code is still very rough at this point; it is meant as a simple demonstration of driving PhantomJS through Selenium, not as a production crawler.

Note: This article is still being written and is currently incomplete.
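
The script below relies on two helper modules that aren't shown in full here: sqladb, which defines the Link table, and selenium_crawler, which wraps the PhantomJS webdriver (a sketch of it follows the JavaScript at the end). A minimal sqladb that matches how the crawler uses it might look like the following; the table name and id column are assumptions, but url and status are referenced directly by the crawler, and the unique constraint on url is what raises the IntegrityError the crawler catches for duplicate links.

from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Link(Base):
    __tablename__ = 'links'  # table name is an assumption

    id = Column(Integer, primary_key=True)
    url = Column(Text, nullable=False, unique=True)      # unique: duplicates raise IntegrityError
    status = Column(Integer, nullable=False, default=0)  # 0 = ready to crawl, 1 = crawled

if __name__ == '__main__':
    # create the table (run once before starting the crawler)
    engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
    Base.metadata.create_all(engine)

With that in place, the crawler itself: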

from selenium_crawler import SeleniumCrawler
from sqladb import Link
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError
import time
import urllib.parse
import urllib.robotparser

try:
    # connect to the DB and start an SQLAlchemy session
    engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
    engine.connect()
    Session = sessionmaker(bind=engine)
    session = Session()

    while True:
        # Get the links from the DB that are ready to be crawled
        for link in session.query(Link).filter(Link.status == 0):
            crawler = SeleniumCrawler()

            try:
                browser = crawler.navigate(link.url)
            except Exception as ex:
                print(str(ex))
                continue  # go get the next link from the database

            crawler.screenshot()

            # Get the visible URLs via jQuery (get_urls.js, shown below, returns them in WebScrapeNS.data)
            response = crawler.run_js('get_urls.js')
            urls = response['urls']

            for url in urls:
                url = url.strip()
                if url == '' or '#' in url:
                    continue  # skip empty hrefs and any URL containing an on-page fragment (#)

                parsed_url = urllib.parse.urlparse(url)

                if parsed_url.hostname is None:
                    url = urllib.parse.urljoin(link.url, url)

                p = urllib.parse.urlparse(url)

                if p.scheme not in ('http', 'https'):  # we're only going to crawl http or https
                    continue

                # Find the robots.txt so we know what to avoid
                robots_url = p.scheme + '://' + p.hostname

                if p.port:
                    robots_url += p.port

                robots_url += '/robots.txt'

                try:
                    rp = urllib.robotparser.RobotFileParser()
                    rp.set_url(robots_url)
                    rp.read()

                    if not rp.can_fetch("*", url):
                        continue
                except Exception:
                    pass  # couldn't fetch robots.txt; assume crawling is allowed

                # add the link we found to the DB so we can crawl it
                add_link = Link()
                add_link.url = url
                session.add(add_link)

                try:
                    session.commit()
                except IntegrityError as ex:
                    session.rollback()  # URL already exists in the database

            # update the status of the link we just successfully crawled
            link.status = 1
            session.commit()

            # quit this link's browser so PhantomJS processes don't accumulate
            browser.quit()

            time.sleep(1)
        time.sleep(1)
except Exception as ex:
    print(str(ex))
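
The loop only processes rows with status 0, so the links table needs at least one seed row before the crawler will do anything. A quick way to add one (the seed URL here is just an example):

from sqladb import Link
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
session = sessionmaker(bind=engine)()
session.add(Link(url='https://example.com/'))  # example seed; status defaults to 0 (ready to crawl)
session.commit()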

JavaScript to Get URLs for Visible Links

(function($, _) {
    var urls = [];

    $('a:visible').each(function() { // Note: visibility: hidden and opacity: 0 are considered 'visible'
        var url = $(this).attr('href');

        if(url) {
            urls.push(url);
        }
    });
    window.WebScrapeNS.data = {urls: urls};

})(window.WebScrapeNS.$, window.WebScrapeNS._);
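
The selenium_crawler module isn't shown here either. Based on how the crawler and the script above use it, run_js evidently injects jQuery and Underscore into a WebScrapeNS namespace on the page, executes the given script file, and returns whatever the script stored in window.WebScrapeNS.data. A minimal sketch under those assumptions (the local jquery.min.js and underscore.min.js filenames and the screenshot naming scheme are also assumptions):

import time
from selenium import webdriver

class SeleniumCrawler:
    def __init__(self):
        # PhantomJS support was deprecated in Selenium 3.8, but it's what this article targets
        self.browser = webdriver.PhantomJS()

    def navigate(self, url):
        # load the page and hand back the underlying webdriver
        self.browser.get(url)
        return self.browser

    def screenshot(self):
        # save a screenshot of the current page; the naming scheme is an assumption
        self.browser.save_screenshot('screenshot-%d.png' % int(time.time()))

    def run_js(self, filename):
        # set up WebScrapeNS with jQuery and Underscore, run the script file,
        # then read back whatever it stored in WebScrapeNS.data
        with open('jquery.min.js') as f:
            setup = f.read()
        with open('underscore.min.js') as f:
            setup += ';\n' + f.read()
        setup += ';\nwindow.WebScrapeNS = {$: jQuery.noConflict(true), _: _.noConflict()};'
        self.browser.execute_script(setup)
        with open(filename) as f:
            self.browser.execute_script(f.read())
        return self.browser.execute_script('return window.WebScrapeNS.data;')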

Code under MIT License unless otherwise indicated.
© 2020, Downranked, LLC.