Below is code that finds the visible links on a web page and then follows them. The code is still quite rough at this point and is meant as a simple demonstration of PhantomJS functionality.
Note: This article is still being written and is currently incomplete.
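The script below depends on two small helper modules that aren't reproduced in full in this post. The first, sqladb, defines the Link table. Here is a minimal sketch of what it might look like, inferred from how Link is used below; the column names and the unique constraint on url are assumptions rather than the exact module, but a unique constraint of this kind is what produces the IntegrityError handled later.

# sqladb.py -- a minimal sketch of the Link model the crawler expects (assumed, not the exact module)
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Link(Base):
    __tablename__ = 'links'

    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True, nullable=False)  # unique, so duplicate URLs raise IntegrityError
    status = Column(Integer, default=0)                 # 0 = not yet crawled, 1 = crawled

With a table like that in place, the main crawl loop looks like this: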
from selenium_crawler import SeleniumCrawler
from selenium.webdriver.common.keys import Keys
import json
import time
from sqladb import Link
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import urllib.robotparser
import urllib.parse
from sqlalchemy.exc import IntegrityError
try:
    # connect to the DB and start an SQLAlchemy session
    engine = create_engine('postgresql://postgres:my.password.123@localhost:5432/webcrawl')
    engine.connect()
    Session = sessionmaker(bind=engine)
    session = Session()
    while True:
        # Get the links from the DB that are ready to be crawled
        for link in session.query(Link).filter(Link.status == 0):
            crawler = SeleniumCrawler()
            try:
                browser = crawler.navigate(link.url)
            except Exception as ex:
                print(str(ex))
                continue  # go get the next link from the database
            crawler.screenshot()
            # Get the visible URLs on the page via jQuery (see get_urls.js at the end of this post)
            response = crawler.run_js('get_urls.js')
            urls = response['urls']
            for url in urls:
                url = url.strip()
                if url == '' or '#' in url:
                    continue  # skip empty hrefs and any URL containing a fragment (#)
                parsed_url = urllib.parse.urlparse(url)
                if parsed_url.hostname is None:
                    url = urllib.parse.urljoin(link.url, url)
                p = urllib.parse.urlparse(url)
                if p.scheme not in ('http', 'https'): # we're only going to crawl http or https
                    continue
                # Build the site's robots.txt URL so we know which paths to avoid
                robots_url = p.scheme + '://' + p.hostname
                if p.port:
                    robots_url += ':' + str(p.port)  # p.port is an int, so convert it before concatenating
                robots_url += '/robots.txt'
                try:
                    rp = urllib.robotparser.RobotFileParser()
                    rp.set_url(robots_url)
                    rp.read()
                    if not rp.can_fetch("*", url):
                        continue
                except Exception:
                    pass  # if robots.txt can't be fetched or parsed, go ahead and crawl the URL anyway
                # add the link we found to the DB so we can crawl it
                add_link = Link()
                add_link.url = url
                session.add(add_link)
                try:
                    session.commit()
                except IntegrityError as ex:
                    session.rollback() #URL already exists in the database
            # mark the link we just successfully crawled as done
            link.status = 1
            session.commit()
            browser.quit()  # close this PhantomJS instance before moving on to the next link
            time.sleep(1)
        time.sleep(1)
except Exception as ex:
    print(str(ex))
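The second helper, selenium_crawler, wraps the browser driver. It isn't reproduced in the post either, so the sketch below only shows the shape implied by the calls above (navigate, screenshot, run_js). It assumes the phantomjs binary is on the PATH, that an older Selenium release with PhantomJS support is installed, and that the target page already loads jQuery and Underscore for the WebScrapeNS namespace; a real wrapper would more likely inject its own copies of those libraries.

# selenium_crawler.py -- a rough sketch of the wrapper assumed above, not the post's actual implementation
from selenium import webdriver

class SeleniumCrawler:
    # Set up window.WebScrapeNS so injected scripts can use $ and _ without touching
    # the page's own globals. This assumes the page already loads jQuery and Underscore.
    NS_BOOTSTRAP = (
        "window.WebScrapeNS = window.WebScrapeNS || {};"
        "window.WebScrapeNS.$ = window.jQuery;"
        "window.WebScrapeNS._ = window._;"
    )

    def __init__(self):
        self.browser = webdriver.PhantomJS()  # PhantomJS support was removed in Selenium 4
        self.browser.set_page_load_timeout(30)

    def navigate(self, url):
        self.browser.get(url)
        return self.browser

    def screenshot(self, path='screenshot.png'):
        self.browser.save_screenshot(path)

    def run_js(self, filename):
        # Execute a script file inside the page, then return whatever it stored
        # in window.WebScrapeNS.data (see get_urls.js below).
        with open(filename) as f:
            script = f.read()
        self.browser.execute_script(self.NS_BOOTSTRAP)
        self.browser.execute_script(script)
        return self.browser.execute_script('return window.WebScrapeNS.data;')

Finally, get_urls.js, the script that run_js executes inside the page, collects the href of every visible anchor tag: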
(function($, _) {
    var urls = [];
    $('a:visible').each(function() { //Note: visibility: hidden and opacity: 0 are considered 'visible'
        var url = $(this).attr('href');
        if(url) {
            urls.push(url);
        }
    });
    window.WebScrapeNS.data = {urls: urls};
})(window.WebScrapeNS.$, window.WebScrapeNS._);