Add some basic Web scraping logic

Can now pull basic account balance and transaction information.
This commit is contained in:
Ian Adam Naval 2015-05-11 19:16:56 -04:00
parent b05785ccdb
commit a64b8b2c41
3 changed files with 177 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
__pycache__
ghostdriver.log
*.pyc

View File

@@ -0,0 +1,95 @@
"""Scraper implementation for Bank of America."""
from getpass import getpass
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from common import BankWebAuthenticator, BankScraper
class BankOfAmericaWebAuthenticator(BankWebAuthenticator):
    """Logs a user in using the two-step form currently provided by BoA.

    This will be replaced at some point with a single sign-in form
    according to the BoA website.

    Currently, we deal with the "Verify your Identity" page by parsing
    the question and prompting the user.
    """

    def login(self, driver, credentials):
        """Performs the two-step BoA sign-in flow.

        :param driver: A Selenium web driver
        :param credentials: Two-tuple of (username, password)
        :return: Whether the login appears to have succeeded
        """
        username, password = credentials
        driver.get("https://bankofamerica.com")
        driver.find_element_by_id("id").send_keys(username)
        driver.find_element_by_id("hp-sign-in-btn").click()
        try:
            driver.find_element_by_id("tlpvt-passcode-input").send_keys(password)
            driver.find_element_by_id("passcode-confirm-sk-submit").click()
        except NoSuchElementException:
            # No passcode field means we landed on the "Verify your
            # Identity" challenge page: show the site's question to the
            # user and submit their answer.  Pass an explicit parser so
            # bs4 does not warn and parsing is identical everywhere.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            labels = soup.select('label[for=tlpvt-challenge-answer]')
            if not labels:
                # Neither the passcode field nor the challenge question
                # is present -- unknown page; report a failed login
                # instead of crashing with an IndexError.
                return False
            answer = input(labels[0].text.strip())
            driver.find_element_by_id("tlpvt-challenge-answer").send_keys(answer)
            driver.find_element_by_id("verify-cq-submit").click()
        return "Your request can't be completed:" not in driver.page_source
class BankOfAmericaBankScraper(BankScraper):
    """Scrapes account balances and transactions from the BoA site."""

    # Overview page listing all accounts; request_locale pins English so
    # the id/class selectors below stay valid.
    ACCOUNTS_URL = ("https://secure.bankofamerica.com/myaccounts/brain/"
                    "redirect.go?target=accountsoverview&request_locale=en-us")

    def __init__(self, driver):
        """:param driver: A Selenium web driver"""
        authenticator = BankOfAmericaWebAuthenticator()
        super(BankOfAmericaBankScraper, self).__init__(driver, authenticator)

    def get_accounts(self):
        """Retrieves the name and balance of every listed account.

        :return: List of dicts with keys 'name' and 'balance'
        """
        self.driver.get(self.ACCOUNTS_URL)
        # Explicit parser avoids bs4's parser-guessing warning and keeps
        # results consistent across environments.
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        names = [e.text.strip() for e in soup.find_all(class_='image-account')]
        balances = [e.text.strip() for e in soup.find_all(class_='TL_NPI_L1')]
        return [
            {'name': name, 'balance': balance}
            for name, balance in zip(names, balances)
        ]

    def get_transactions_for_account(self, account):
        """Fetches and attaches the transaction list for one account.

        :param account: Dict with key 'name'
        :return: The same dict with a new 'transactions' key
        """
        self.driver.get(self.ACCOUNTS_URL)
        self.driver.find_element_by_id(account['name']).click()
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        rows = soup.select('.transaction-records tr')
        transactions = [self._tr_to_transaction(row) for row in rows]
        # Header/spacer rows parse to None; keep only real transactions.
        account['transactions'] = [t for t in transactions if t]
        return account

    def _tr_to_transaction(self, tr):
        """Parses one table row into a transaction dict.

        :param tr: A bs4 table-row element
        :return: Dict with keys 'date', 'description' and 'amount', or
            None for rows (e.g. headers) missing any of those cells.
        """
        try:
            return {
                'date': tr.select('.date-action span')[0].text.strip(),
                'description':
                    tr.select('.transTitleForEditDesc')[0].text.strip(),
                'amount': tr.select('.amount')[0].text.strip()
            }
        except IndexError:
            # Only a missing cell (empty select() result) marks a
            # non-transaction row.  The original bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit and real bugs.
            return None
def main():
    """Entry point: prompts for credentials and prints scraped data."""
    username = input("username: ")
    password = getpass("password: ")
    scraper = BankOfAmericaBankScraper(webdriver.PhantomJS())
    print(scraper.get_data((username, password)))


if __name__ == '__main__':
    main()

79
scrapers/common.py Normal file
View File

@@ -0,0 +1,79 @@
"""Module for bank website scraping interfaces and base classes."""
from selenium.webdriver.remote.webdriver import WebDriver
class BankWebAuthenticator(object):
    """Interface for objects that can log a user into a bank website."""

    def login(self, driver, credentials):
        """Logs a user in using the given credentials.

        :param driver: A Selenium web driver
        :param credentials: Credentials understood by the implementation
        :return: Whether the login was successful
        """
        raise NotImplementedError("Must extend BankWebAuthenticator")
class BankScraper(object):
    """Generic interface for a Web scraper that pulls information from
    bank websites."""

    def __init__(self, driver, authenticator):
        """Initializes the BankScraper

        :param driver: A Selenium web driver
        :param authenticator: A BankWebAuthenticator
        :raises TypeError: If authenticator is not a BankWebAuthenticator
        """
        # Explicit check instead of `assert`: asserts are stripped when
        # Python runs with -O, which would silently drop the validation.
        if not isinstance(authenticator, BankWebAuthenticator):
            raise TypeError("authenticator must be a BankWebAuthenticator")
        self.driver = driver
        self.authenticator = authenticator

    def get_data(self, credentials, refresh=False):
        """Returns some data structure with the information parsed.

        Locally caches if possible.

        :param credentials: Credentials for the authenticator,
            two-tuple of (username, password)
        :param refresh: Forces the scraper to ignore local cache
            (currently unused by this base class)
        :return: The data retrieved, or [] when login fails
        """
        if self.authenticator.login(self.driver, credentials):
            accounts = self.get_accounts()
            return {
                'accounts': self.get_transactions(accounts)
            }
        # TODO: should maybe raise an exception here instead of silently
        # failing
        return []

    def get_accounts(self):
        """Retrieves account information such as bank balance

        :return: List of dicts that contain the keys 'name' and
            'balance'.
        """
        raise NotImplementedError("Must extend BankScraper")

    def get_transactions(self, accounts):
        """Gets the transactions associated with each account.

        :param accounts: List of dicts with the key 'name'
        :return: List of the same dicts but with a new key:
            'transactions', which is itself a list of dicts
        """
        return [
            self.get_transactions_for_account(account)
            for account in accounts
        ]

    def get_transactions_for_account(self, account):
        """Gets the transactions for one account.

        :param account: dict with key 'name'
        :return: dict with keys 'name' and 'transactions', whose value
            is a list of the transactions for this account
        """
        raise NotImplementedError("Must extend BankScraper")