Add some basic Web scraping logic

Can now pull basic account balance and transaction information.
This commit is contained in:
Ian Adam Naval 2015-05-11 19:16:56 -04:00
parent b05785ccdb
commit a64b8b2c41
3 changed files with 177 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
__pycache__
ghostdriver.log
*.pyc

View File

@@ -0,0 +1,95 @@
"""Scraper implementation for Bank of America."""
from getpass import getpass
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from common import BankWebAuthenticator, BankScraper
class BankOfAmericaWebAuthenticator(BankWebAuthenticator):
    """Logs a user in using the two-step form currently provided by BoA.

    This will be replaced at some point with a single sign-in form
    according to the BoA website.

    Currently, we deal with the "Verify your Identity" page by parsing
    the question and prompting the user.
    """

    def login(self, driver, credentials):
        """Performs the two-step BoA sign-in flow.

        :param driver: A Selenium web driver
        :param credentials: Two-tuple of (username, password)
        :return: Whether the login appears to have succeeded
        """
        username, password = credentials
        driver.get("https://bankofamerica.com")
        driver.find_element_by_id("id").send_keys(username)
        driver.find_element_by_id("hp-sign-in-btn").click()
        try:
            driver.find_element_by_id("tlpvt-passcode-input").send_keys(password)
            driver.find_element_by_id("passcode-confirm-sk-submit").click()
        except NoSuchElementException:
            # No passcode field means we landed on the "Verify your
            # Identity" challenge page: show the site's question to the
            # user and submit their answer.  Pass an explicit parser so
            # bs4 does not warn and parsing is identical everywhere.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            labels = soup.select('label[for=tlpvt-challenge-answer]')
            if not labels:
                # Neither the passcode field nor the challenge question
                # is present -- unknown page; report a failed login
                # instead of crashing with an IndexError.
                return False
            answer = input(labels[0].text.strip())
            driver.find_element_by_id("tlpvt-challenge-answer").send_keys(answer)
            driver.find_element_by_id("verify-cq-submit").click()
        return "Your request can't be completed:" not in driver.page_source
class BankOfAmericaBankScraper(BankScraper):
    """Scrapes account balances and transactions from the BoA site."""

    # Overview page listing all accounts; request_locale pins English so
    # the id/class selectors below stay valid.
    ACCOUNTS_URL = ("https://secure.bankofamerica.com/myaccounts/brain/"
                    "redirect.go?target=accountsoverview&request_locale=en-us")

    def __init__(self, driver):
        """:param driver: A Selenium web driver"""
        authenticator = BankOfAmericaWebAuthenticator()
        super(BankOfAmericaBankScraper, self).__init__(driver, authenticator)

    def get_accounts(self):
        """Retrieves the name and balance of every listed account.

        :return: List of dicts with keys 'name' and 'balance'
        """
        self.driver.get(self.ACCOUNTS_URL)
        # Explicit parser avoids bs4's parser-guessing warning and keeps
        # results consistent across environments.
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        names = [e.text.strip() for e in soup.find_all(class_='image-account')]
        balances = [e.text.strip() for e in soup.find_all(class_='TL_NPI_L1')]
        return [
            {'name': name, 'balance': balance}
            for name, balance in zip(names, balances)
        ]

    def get_transactions_for_account(self, account):
        """Fetches and attaches the transaction list for one account.

        :param account: Dict with key 'name'
        :return: The same dict with a new 'transactions' key
        """
        self.driver.get(self.ACCOUNTS_URL)
        self.driver.find_element_by_id(account['name']).click()
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        rows = soup.select('.transaction-records tr')
        transactions = [self._tr_to_transaction(row) for row in rows]
        # Header/spacer rows parse to None; keep only real transactions.
        account['transactions'] = [t for t in transactions if t]
        return account

    def _tr_to_transaction(self, tr):
        """Parses one table row into a transaction dict.

        :param tr: A bs4 table-row element
        :return: Dict with keys 'date', 'description' and 'amount', or
            None for rows (e.g. headers) missing any of those cells.
        """
        try:
            return {
                'date': tr.select('.date-action span')[0].text.strip(),
                'description':
                    tr.select('.transTitleForEditDesc')[0].text.strip(),
                'amount': tr.select('.amount')[0].text.strip()
            }
        except IndexError:
            # Only a missing cell (empty select() result) marks a
            # non-transaction row.  The original bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit and real bugs.
            return None
def main():
    """Entry point: prompts for credentials and prints scraped data."""
    username = input("username: ")
    password = getpass("password: ")
    scraper = BankOfAmericaBankScraper(webdriver.PhantomJS())
    print(scraper.get_data((username, password)))


if __name__ == '__main__':
    main()

79
scrapers/common.py Normal file
View File

@@ -0,0 +1,79 @@
"""Module for bank website scraping interfaces and base classes."""
from selenium.webdriver.remote.webdriver import WebDriver
class BankWebAuthenticator(object):
    """Interface for objects that can log a user into a bank website."""

    def login(self, driver, credentials):
        """Logs a user in using the given credentials.

        :param driver: A Selenium web driver
        :param credentials: Credentials understood by the implementation
        :return: Whether the login was successful
        """
        raise NotImplementedError("Must extend BankWebAuthenticator")
class BankScraper(object):
    """Generic interface for a Web scraper that pulls information from
    bank websites."""

    def __init__(self, driver, authenticator):
        """Initializes the BankScraper

        :param driver: A Selenium web driver
        :param authenticator: A BankWebAuthenticator
        :raises TypeError: If authenticator is not a BankWebAuthenticator
        """
        # Explicit check instead of `assert`: asserts are stripped when
        # Python runs with -O, which would silently drop the validation.
        if not isinstance(authenticator, BankWebAuthenticator):
            raise TypeError("authenticator must be a BankWebAuthenticator")
        self.driver = driver
        self.authenticator = authenticator

    def get_data(self, credentials, refresh=False):
        """Returns some data structure with the information parsed.

        Locally caches if possible.

        :param credentials: Credentials for the authenticator,
            two-tuple of (username, password)
        :param refresh: Forces the scraper to ignore local cache
            (currently unused by this base class)
        :return: The data retrieved, or [] when login fails
        """
        if self.authenticator.login(self.driver, credentials):
            accounts = self.get_accounts()
            return {
                'accounts': self.get_transactions(accounts)
            }
        # TODO: should maybe raise an exception here instead of silently
        # failing
        return []

    def get_accounts(self):
        """Retrieves account information such as bank balance

        :return: List of dicts that contain the keys 'name' and
            'balance'.
        """
        raise NotImplementedError("Must extend BankScraper")

    def get_transactions(self, accounts):
        """Gets the transactions associated with each account.

        :param accounts: List of dicts with the key 'name'
        :return: List of the same dicts but with a new key:
            'transactions', which is itself a list of dicts
        """
        return [
            self.get_transactions_for_account(account)
            for account in accounts
        ]

    def get_transactions_for_account(self, account):
        """Gets the transactions for one account.

        :param account: dict with key 'name'
        :return: dict with keys 'name' and 'transactions', whose value
            is a list of the transactions for this account
        """
        raise NotImplementedError("Must extend BankScraper")