From a64b8b2c41e07b25f01fabd3013fe88ca0029bb4 Mon Sep 17 00:00:00 2001
From: Ian Adam Naval
Date: Mon, 11 May 2015 19:16:56 -0400
Subject: [PATCH] Add some basic Web scraping logic

Can now pull basic account balance and transaction information.
---
 .gitignore                  |   3 +
 scrapers/bank_of_america.py | 137 ++++++++++++++++++++++++++++++++++++
 scrapers/common.py          |  91 ++++++++++++++++++++++++
 3 files changed, 231 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 scrapers/bank_of_america.py
 create mode 100644 scrapers/common.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..65f614b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__
+ghostdriver.log
+*.pyc
diff --git a/scrapers/bank_of_america.py b/scrapers/bank_of_america.py
new file mode 100644
index 0000000..3dfaf25
--- /dev/null
+++ b/scrapers/bank_of_america.py
@@ -0,0 +1,137 @@
+"""Scraper implementation for Bank of America."""
+
+from getpass import getpass
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from bs4 import BeautifulSoup
+
+from common import BankWebAuthenticator, BankScraper
+
+
+class BankOfAmericaWebAuthenticator(BankWebAuthenticator):
+    """Logs a user in using the two-step form currently provided by BoA.
+    This will be replaced at some point with a single sign-in form
+    according to the BoA website.
+
+    Currently, we deal with the "Verify your Identity" page by parsing
+    the question and prompting the user.
+    """
+
+    def login(self, driver, credentials):
+        """Submits the sign-in forms and reports whether login worked.
+
+        :param driver: A Selenium web driver
+        :param credentials: Two-tuple of (username, password)
+
+        :return: False if the resulting page shows the "Your request
+                 can't be completed:" error, True otherwise
+        """
+        username, password = credentials
+        driver.get("https://bankofamerica.com")
+        driver.find_element_by_id("id").send_keys(username)
+        driver.find_element_by_id("hp-sign-in-btn").click()
+        try:
+            driver.find_element_by_id("tlpvt-passcode-input").send_keys(password)
+            driver.find_element_by_id("passcode-confirm-sk-submit").click()
+        except NoSuchElementException:
+            # Prompt user for challenge page
+            # NOTE(review): the password is never submitted on this path;
+            # confirm whether a passcode form follows the challenge page.
+            soup = BeautifulSoup(driver.page_source, "html.parser")
+            prompt = soup.select('label[for=tlpvt-challenge-answer]')[0].text
+            answer = input(prompt.strip())
+            driver.find_element_by_id("tlpvt-challenge-answer").send_keys(answer)
+            driver.find_element_by_id("verify-cq-submit").click()
+        return "Your request can't be completed:" not in driver.page_source
+
+
+class BankOfAmericaBankScraper(BankScraper):
+    """Scrapes account balances and transactions from Bank of America."""
+
+    ACCOUNTS_URL = ("https://secure.bankofamerica.com/myaccounts/brain/"
+                    "redirect.go?target=accountsoverview&request_locale=en-us")
+
+    def __init__(self, driver):
+        """Initializes the scraper with the BoA authenticator.
+
+        :param driver: A Selenium web driver
+        """
+        authenticator = BankOfAmericaWebAuthenticator()
+        super(BankOfAmericaBankScraper, self).__init__(driver, authenticator)
+
+    def get_accounts(self):
+        """Reads account names and balances from the accounts overview.
+
+        :return: List of dicts that contain the keys 'name' and
+                 'balance'.
+        """
+        self.driver.get(self.ACCOUNTS_URL)
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+        names = [e.text.strip() for e in soup.find_all(class_='image-account')]
+        balances = [e.text.strip() for e in soup.find_all(class_='TL_NPI_L1')]
+        # zip() pairs names and balances by position and stops at the
+        # shorter list.
+        accounts = [
+            {
+                'name': name,
+                'balance': balance
+            }
+            for name, balance in zip(names, balances)
+        ]
+        return accounts
+
+    def get_transactions_for_account(self, account):
+        """Adds the scraped transactions to the given account dict.
+
+        :param account: dict with key 'name'; the name is also used as
+                        the id of the element to click on the overview
+
+        :return: The same dict with a new key 'transactions', a list of
+                 dicts with the keys 'date', 'description' and 'amount'
+        """
+        name = account['name']
+        self.driver.get(self.ACCOUNTS_URL)
+        self.driver.find_element_by_id(name).click()
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+        rows = soup.select('.transaction-records tr')
+        transactions = [self._tr_to_transaction(row) for row in rows]
+        account['transactions'] = [e for e in transactions if e]  # filter None
+        return account
+
+    def _tr_to_transaction(self, tr):
+        """Converts one table row into a transaction dict.
+
+        :param tr: A BeautifulSoup table-row (tr) element
+
+        :return: dict with the keys 'date', 'description' and 'amount',
+                 or None if the row lacks any of those cells
+        """
+        try:
+            date = tr.select('.date-action span')[0].text.strip()
+            description = tr.select('.transTitleForEditDesc')[0].text.strip()
+            amount = tr.select('.amount')[0].text.strip()
+        except IndexError:
+            # select() found no match, so this is not a transaction row
+            return None
+        return {
+            'date': date,
+            'description': description,
+            'amount': amount
+        }
+
+
+def main():
+    """Prompts for credentials and prints the scraped account data."""
+    driver = webdriver.PhantomJS()
+    try:
+        credentials = (input("username: "), getpass("password: "))
+        scraper = BankOfAmericaBankScraper(driver)
+        print(scraper.get_data(credentials))
+    finally:
+        # Always shut down the PhantomJS process, even on errors
+        driver.quit()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scrapers/common.py b/scrapers/common.py
new file mode 100644
index 0000000..4cfa670
--- /dev/null
+++ b/scrapers/common.py
@@ -0,0 +1,91 @@
+"""Module for bank website scraping interfaces and base classes."""
+
+from selenium.webdriver.remote.webdriver import WebDriver
+
+
+class BankWebAuthenticator(object):
+    """Generic interface for logging a user in to a bank website."""
+
+    def login(self, driver, credentials):
+        """Logs a user in using the given credentials.
+
+        :param driver: A Selenium web driver
+        :param credentials: Two-tuple of (username, password)
+
+        :return: Whether the login was successful
+        """
+        raise NotImplementedError("Must extend BankWebAuthenticator")
+
+
+class BankScraper(object):
+    """Generic interface for a Web scraper that pulls information from
+    bank websites."""
+
+    def __init__(self, driver, authenticator):
+        """Initializes the BankScraper
+
+        :param driver: A Selenium web driver
+        :param authenticator: A BankWebAuthenticator
+
+        :raises TypeError: If authenticator is not a BankWebAuthenticator
+        """
+        # Raise explicitly rather than assert: asserts are stripped
+        # when Python runs with -O.
+        if not isinstance(authenticator, BankWebAuthenticator):
+            raise TypeError("authenticator must be a BankWebAuthenticator")
+        self.driver = driver
+        self.authenticator = authenticator
+
+    def get_data(self, credentials, refresh=False):
+        """Returns some data structure with the information parsed.
+        Local caching is not implemented yet, so every call logs in and
+        scrapes again.
+
+        :param credentials: Credentials for the authenticator
+                            two-tuple of (username, password)
+        :param refresh: Forces the scraper to ignore local cache
+                        (currently unused)
+
+        :return: dict with the key 'accounts' on success, or an empty
+                 list if the login failed
+        """
+        if self.authenticator.login(self.driver, credentials):
+            accounts = self.get_accounts()
+            return {
+                'accounts': self.get_transactions(accounts)
+            }
+        else:
+            # should maybe raise an exception here instead of silently
+            # failing
+            return []
+
+    def get_accounts(self):
+        """Retrieves account information such as bank balance
+
+        :return: List of dicts that contain the keys 'name' and
+                 'balance'.
+        """
+        raise NotImplementedError("Must extend BankScraper")
+
+    def get_transactions(self, accounts):
+        """Gets the transactions associated with each account.
+
+        :param accounts: List of dicts with the key 'name'
+
+        :return: List of the same dicts but with a new key:
+                 'transactions', which is itself a list of dicts
+        """
+        return [
+            self.get_transactions_for_account(account)
+            for account in accounts
+        ]
+
+    def get_transactions_for_account(self, account):
+        """Gets the transactions for one account.
+
+        :param account: dict with key 'name'
+
+        :return: dict with keys 'name' and 'transactions', whose value
+                 is a list of the transactions for this account
+        """
+        raise NotImplementedError("Must extend BankScraper")