Add some basic Web scraping logic
Can now pull basic account balance and transaction information.
This commit is contained in:
parent
b05785ccdb
commit
a64b8b2c41
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
__pycache__
|
||||||
|
ghostdriver.log
|
||||||
|
*.pyc
|
||||||
95
scrapers/bank_of_america.py
Normal file
95
scrapers/bank_of_america.py
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
"""Scraper implementation for Bank of America."""
|
||||||
|
|
||||||
|
from getpass import getpass
|
||||||
|
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.common.exceptions import NoSuchElementException
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from common import BankWebAuthenticator, BankScraper
|
||||||
|
|
||||||
|
|
||||||
|
class BankOfAmericaWebAuthenticator(BankWebAuthenticator):
    """Logs a user in using the two-step form currently provided by BoA.

    This will be replaced at some point with a single sign-in form
    according to the BoA website.

    Currently, we deal with the "Verify your Identity" page by parsing
    the question and prompting the user.
    """

    def login(self, driver, credentials):
        """Submit the username, then the password or a challenge answer.

        :param driver: A Selenium web driver
        :param credentials: two-tuple of (username, password)

        :return: False when the resulting page contains the
            "Your request can't be completed:" banner, or when neither
            the passcode field nor a challenge question can be found;
            True otherwise
        """
        username, password = credentials
        driver.get("https://bankofamerica.com")
        driver.find_element_by_id("id").send_keys(username)
        driver.find_element_by_id("hp-sign-in-btn").click()
        try:
            # Only this lookup tells us which page we landed on, so it is
            # the only call guarded; a missing submit button further down
            # must not be mistaken for the challenge page.
            passcode_input = driver.find_element_by_id("tlpvt-passcode-input")
        except NoSuchElementException:
            # Prompt user for challenge page
            soup = BeautifulSoup(driver.page_source)
            labels = soup.select('label[for=tlpvt-challenge-answer]')
            if not labels:
                # Neither the passcode form nor a challenge question is
                # present, so there is nothing we can submit.
                return False
            answer = input(labels[0].text.strip())
            driver.find_element_by_id("tlpvt-challenge-answer").send_keys(answer)
            driver.find_element_by_id("verify-cq-submit").click()
            # NOTE(review): the password is never submitted on this path;
            # confirm whether BoA shows the passcode form after the
            # challenge and, if so, whether it needs to be filled in here.
        else:
            passcode_input.send_keys(password)
            driver.find_element_by_id("passcode-confirm-sk-submit").click()
        return "Your request can't be completed:" not in driver.page_source
|
||||||
|
|
||||||
|
|
||||||
|
class BankOfAmericaBankScraper(BankScraper):
    """Scraper that reads balances and transactions from Bank of America."""

    # Page that is parsed for account names and balances, and from which
    # each account's transaction page is opened.
    ACCOUNTS_URL = ("https://secure.bankofamerica.com/myaccounts/brain/"
                    "redirect.go?target=accountsoverview&request_locale=en-us")

    def __init__(self, driver):
        """Initializes the scraper with a BoA-specific authenticator.

        :param driver: A Selenium web driver
        """
        authenticator = BankOfAmericaWebAuthenticator()
        super(BankOfAmericaBankScraper, self).__init__(driver, authenticator)

    def get_accounts(self):
        """Retrieves the name and balance of every listed account.

        :return: List of dicts that contain the keys 'name' and
            'balance', both as the stripped text found on the page
        """
        self.driver.get(self.ACCOUNTS_URL)
        soup = BeautifulSoup(self.driver.page_source)
        names = [e.text.strip() for e in soup.find_all(class_='image-account')]
        balances = [e.text.strip() for e in soup.find_all(class_='TL_NPI_L1')]
        # Names and balances are paired by position; zip() stops at the
        # shorter list if the two counts ever differ.
        accounts = [
            {
                'name': name,
                'balance': balance,
            }
            for name, balance in zip(names, balances)
        ]
        return accounts

    def get_transactions_for_account(self, account):
        """Gets the transactions for one account.

        :param account: dict with key 'name'; it is updated in place

        :return: The same dict with a new key 'transactions', a list of
            dicts with the keys 'date', 'description' and 'amount'
        """
        name = account['name']
        self.driver.get(self.ACCOUNTS_URL)
        # The account name is used as the id of the element to click.
        self.driver.find_element_by_id(name).click()
        soup = BeautifulSoup(self.driver.page_source)
        rows = soup.select('.transaction-records tr')
        transactions = [self._tr_to_transaction(row) for row in rows]
        account['transactions'] = [e for e in transactions if e]  # filter None
        return account

    def _tr_to_transaction(self, tr):
        """Converts one table row into a transaction dict.

        :param tr: A parsed row taken from the transaction table

        :return: dict with the keys 'date', 'description' and 'amount',
            or None when the row lacks any of those cells
        """
        try:
            date = tr.select('.date-action span')[0].text.strip()
            description = tr.select('.transTitleForEditDesc')[0].text.strip()
            amount = tr.select('.amount')[0].text.strip()
        except IndexError:
            # select() found nothing for one of the cells, so this row is
            # not a transaction. Anything else (e.g. KeyboardInterrupt) is
            # deliberately allowed to propagate.
            return None
        return {
            'date': date,
            'description': description,
            'amount': amount,
        }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Prompts for credentials, scrapes Bank of America and prints the data."""
    driver = webdriver.PhantomJS()
    try:
        credentials = (input("username: "), getpass("password: "))
        scraper = BankOfAmericaBankScraper(driver)
        print(scraper.get_data(credentials))
    finally:
        # Always shut the browser down, even when prompting or scraping
        # fails; otherwise the PhantomJS process is left running.
        driver.quit()


if __name__ == '__main__':
    main()
|
||||||
79
scrapers/common.py
Normal file
79
scrapers/common.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
"""Module for bank website scraping interfaces and base classes."""
|
||||||
|
|
||||||
|
from selenium.webdriver.remote.webdriver import WebDriver
|
||||||
|
|
||||||
|
|
||||||
|
class BankWebAuthenticator(object):
    """Interface for objects that sign a user in to a bank website."""

    def login(self, driver, credentials):
        """Sign in with the supplied credentials.

        This base version always raises; concrete authenticators are
        expected to override it.

        :return: Whether the login was successful
        """
        message = "Must extend BankWebAuthenticator"
        raise NotImplementedError(message)
|
||||||
|
|
||||||
|
|
||||||
|
class BankScraper(object):
    """Generic interface for a Web scraper that pulls information from
    bank websites."""

    def __init__(self, driver, authenticator):
        """Initializes the BankScraper

        :param driver: A Selenium web driver
        :param authenticator: A BankWebAuthenticator

        :raises TypeError: If authenticator is not a BankWebAuthenticator
        """
        # Explicit check instead of `assert`, which is stripped when
        # Python runs with -O and would let a bad authenticator through.
        if not isinstance(authenticator, BankWebAuthenticator):
            raise TypeError("authenticator must be a BankWebAuthenticator")
        self.driver = driver
        self.authenticator = authenticator

    def get_data(self, credentials, refresh=False):
        """Logs in and returns the accounts with their transactions.

        :param credentials: Credentials for the authenticator
            two-tuple of (username, password)
        :param refresh: Intended to force the scraper to ignore a local
            cache; no caching is implemented yet, so it is currently
            ignored

        :return: dict with the key 'accounts' holding the result of
            get_transactions(), or an empty list when the login fails
        """
        if self.authenticator.login(self.driver, credentials):
            accounts = self.get_accounts()
            return {
                'accounts': self.get_transactions(accounts)
            }
        else:
            # should maybe raise an exception here instead of silently
            # failing
            return []

    def get_accounts(self):
        """Retrieves account information such as bank balance

        :return: List of dicts that contain the keys 'name' and
            'balance'.
        """
        raise NotImplementedError("Must extend BankScraper")

    def get_transactions(self, accounts):
        """Gets the transactions associated with each account.

        :param accounts: List of dicts with the key 'name'

        :return: List of the same dicts but with a new key:
            'transactions', which is itself a list of dicts
        """
        return [
            self.get_transactions_for_account(account)
            for account in accounts
        ]

    def get_transactions_for_account(self, account):
        """Gets the transactions for one account.

        :param account: dict with key 'name'

        :return: dict with keys 'name' and 'transactions', whose value
            is a list of the transactions for this account
        """
        raise NotImplementedError("Must extend BankScraper")
|
||||||
Loading…
x
Reference in New Issue
Block a user