From e04bb0edc47b3b6a6a42d1a0c9845afe26c6f6f2 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 18 Jun 2020 03:25:55 +0000 Subject: [PATCH 01/19] initial commit-adding presto example --- notebooks/mysql_init_db.ipynb | 594 ++++++++++++++++++++++++++++++++ presto_example/README.md | 23 ++ presto_example/mysql.cnf_LOCAL | 4 + presto_example/mysql_example.py | 344 ++++++++++++++++++ presto_example/mysql_init_db.py | 234 +++++++++++++ presto_example/requirements.txt | 1 + 6 files changed, 1200 insertions(+) create mode 100644 notebooks/mysql_init_db.ipynb create mode 100644 presto_example/README.md create mode 100644 presto_example/mysql.cnf_LOCAL create mode 100644 presto_example/mysql_example.py create mode 100644 presto_example/mysql_init_db.py create mode 100644 presto_example/requirements.txt diff --git a/notebooks/mysql_init_db.ipynb b/notebooks/mysql_init_db.ipynb new file mode 100644 index 00000000..1dd956d9 --- /dev/null +++ b/notebooks/mysql_init_db.ipynb @@ -0,0 +1,594 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n", + "bucket='com.ria.scratch'\n", + "region='eu-west-1'\n", + "workgroup = 'RIA'\n", + "root_key='as-dedupe/'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# %load ../mysql_example/mysql_init_db.py\n", + "#!/usr/bin/python\n", + "\"\"\"\n", + "This is a setup script for mysql_example. 
It downloads a zip file of\n", + "Illinois campaign contributions and loads them into a MySQL database\n", + "named 'contributions'.\n", + " \n", + "__Note:__ You will need to run this script first before executing\n", + "[mysql_example.py](mysql_example.html).\n", + " \n", + "Tables created:\n", + "* raw_table - raw import of entire CSV file\n", + "* donors - all distinct donors based on name and address\n", + "* recipients - all distinct campaign contribution recipients\n", + "* contributions - contribution amounts tied to donor and recipients tables\n", + "\"\"\"\n", + "\n", + "import os\n", + "import zipfile\n", + "import warnings\n", + "import pandas as pd\n", + "import numpy as np\n", + "from urllib.request import urlopen\n", + "import boto3\n", + "from pyathena import connect\n", + "\n", + "# import MySQLdb\n", + "\n", + "# warnings.filterwarnings('ignore', category=MySQLdb.Warning)\n", + "\n", + "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", + "contributions_txt_file = 'Illinois-campaign-contributions.txt'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(contributions_zip_file) :\n", + " print('downloading', contributions_zip_file, '(~60mb) ...')\n", + " u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n", + " localFile = open(contributions_zip_file, 'wb')\n", + " localFile.write(u.read())\n", + " localFile.close()\n", + "\n", + "if not os.path.exists(contributions_txt_file) :\n", + " zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n", + " print('extracting %s' % contributions_zip_file)\n", + " zip_file_contents = zip_file.namelist()\n", + " for f in zip_file_contents:\n", + " if ('.txt' in f):\n", + " zip_file.extract(f)\n", + " zip_file.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# conn = MySQLdb.connect(read_default_file 
= os.path.abspath('.') + '/mysql.cnf', \n", + "# local_infile = 1,\n", + "# sql_mode=\"ALLOW_INVALID_DATES\",\n", + "# db='contributions')\n", + "# c = conn.cursor()\n", + "\n", + "s3 = boto3.client('s3') \n", + "conn = connect(s3_staging_dir=athena_garbage,\n", + " region_name=region, work_group=workgroup)\n", + "c = conn.cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "importing raw data from csv...\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print('importing raw data from csv...')\n", + "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n", + "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.donors\")\n", + "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.recipients\")\n", + "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.contributions\")\n", + "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# c.execute(\"CREATE TABLE raw_table \"\n", + "# \"(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \"\n", + "# \" address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \"\n", + "# \" state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \"\n", + "# \" date_recieved VARCHAR(10), loan_amount VARCHAR(12), \"\n", + "# \" amount VARCHAR(23), receipt_type VARCHAR(23), \"\n", + "# \" employer VARCHAR(70), occupation VARCHAR(40), \"\n", + "# \" vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \"\n", + "# \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n", + "# \" vendor_city VARCHAR(20), 
vendor_state VARCHAR(10), \"\n", + "# \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n", + "# \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n", + "# \" report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \"\n", + "# \" committee_name VARCHAR(70), committee_id VARCHAR(37)) \"\n", + "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "\n", + "\n", + "# conn.commit()\n", + "q=r'''\n", + "CREATE EXTERNAL TABLE ria_data_science_s3.raw_table \n", + " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", + " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n", + " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n", + " date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n", + " amount VARCHAR(23), receipt_type VARCHAR(23), \n", + " employer VARCHAR(70), occupation VARCHAR(40), \n", + " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n", + " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n", + " vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n", + " vendor_zip VARCHAR(10), description VARCHAR(90), \n", + " election_type VARCHAR(10), election_year VARCHAR(10), \n", + " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n", + " committee_name VARCHAR(70), committee_id VARCHAR(37)) \n", + "ROW FORMAT DELIMITED\n", + " FIELDS TERMINATED BY '\\t'\n", + " LINES TERMINATED BY '\\n' \n", + "LOCATION\n", + " 's3://{}/{}' \n", + "TBLPROPERTIES (\n", + " 'classification'='csv', \n", + " 'skip.header.line.count'='1', \n", + " 'serialization.null.format'='')\n", + "'''.format(bucket, root_key+'raw_table') \n", + "c.execute(q)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1495732: expected 30 fields, saw 
31\\n'\n", + "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ResponseMetadata': {'RequestId': 'C8707997FC007A2B',\n", + " 'HostId': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amz-id-2': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n", + " 'x-amz-request-id': 'C8707997FC007A2B',\n", + " 'date': 'Thu, 18 Jun 2020 03:05:54 GMT',\n", + " 'x-amz-server-side-encryption': 'AES256',\n", + " 'etag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n", + " 'content-length': '0',\n", + " 'server': 'AmazonS3'},\n", + " 'RetryAttempts': 0},\n", + " 'ETag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n", + " 'ServerSideEncryption': 'AES256'}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# c.execute(\"LOAD DATA LOCAL INFILE %s INTO TABLE raw_table \"\n", + "# \"FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\r\\n' \" \n", + "# \"IGNORE 1 LINES \"\n", + "# \"(reciept_id, last_name, first_name, \"\n", + "# \" address_1, address_2, city, state, \"\n", + "# \" zip, report_type, date_recieved, \"\n", + "# \" loan_amount, amount, receipt_type, \"\n", + "# \" employer, occupation, vendor_last_name, \"\n", + "# \" vendor_first_name, vendor_address_1, \"\n", + "# \" vendor_address_2, vendor_city, vendor_state, \"\n", + "# \" vendor_zip, description, election_type, \"\n", + "# \" election_year, \"\n", + "# \" report_period_begin, report_period_end, \"\n", + "# \" committee_name, committee_id, @dummy)\",\n", + "# (contributions_txt_file,))\n", + "\n", + "df = pd.read_csv(contributions_txt_file, sep='\\t', error_bad_lines=False, dtype=str, index_col=0)\n", + "# Remove the very few 
records that mess up the demo \n", + "# (demo purposes only! Don't do something like this in production)\n", + "# c.execute(\"DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10\")\n", + "df = df[df['RcvDate'].str.len()>=10]\n", + "\n", + "# set empty, non-zero, strings in date columns to null\n", + "# c.execute(\"UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10\")\n", + "df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n", + "\n", + "# c.execute(\"UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10\")\n", + "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", + "\n", + "#committee ID is required. Remove the 2 rows that don't have it.\n", + "# c.execute(\"DELETE FROM raw_table WHERE committee_id=''\");\n", + "df = df[df['ID'] != '']\n", + "\n", + "# There's a record with a date stuck in the committee_id column, which causes\n", + "# problems when inserting into the contributions table below. 
Get rid of it this \n", + "# way.\n", + "# c.execute(\"DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9\")\n", + "df = df[df['ID'].str.len() <=9]\n", + "\n", + "# Nullifying empty strings\n", + "df = df.replace(r'^\\s*$', np.nan, regex=True)\n", + "\n", + "s3.put_object(Bucket=bucket, Key=root_key+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "creating donors table...\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print('creating donors table...')\n", + "# c.execute(\"CREATE TABLE donors \"\n", + "# \"(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, \"\n", + "# \" last_name VARCHAR(70), first_name VARCHAR(35), \"\n", + "# \" address_1 VARCHAR(35), address_2 VARCHAR(36), \"\n", + "# \" city VARCHAR(20), state VARCHAR(15), \"\n", + "# \" zip VARCHAR(11), employer VARCHAR(70), \"\n", + "# \" occupation VARCHAR(40)) \"\n", + "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "# c.execute(\"INSERT INTO donors \"\n", + "# \"(first_name, last_name, address_1,\"\n", + "# \" address_2, city, state, zip, employer, occupation) \"\n", + "# \"SELECT DISTINCT \"\n", + "# \"TRIM(first_name), TRIM(last_name), TRIM(address_1), \"\n", + "# \"TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), \"\n", + "# \"TRIM(employer), TRIM(occupation) \"\n", + "# \"FROM raw_table\")\n", + "# conn.commit()\n", + "q='''\n", + "CREATE TABLE ria_data_science_s3.donors as\n", + " with tmp as\n", + " (SELECT DISTINCT \n", + " TRIM(last_name) as last_name, TRIM(first_name) as first_name, \n", + " TRIM(address_1) as address_1, TRIM(address_2) as address_2, \n", + " TRIM(city) city, TRIM(state) as state, \n", + " TRIM(zip) as zip, TRIM(employer) as employer, \n", + " TRIM(occupation) as occupation\n", + " 
FROM ria_data_science_s3.raw_table)\n", + " SELECT row_number() over () as donor_id, * from tmp'''\n", + "c.execute(q)\n", + "# print('creating indexes on donors table')\n", + "# c.execute(\"CREATE INDEX donors_donor_info ON donors \"\n", + "# \"(last_name, first_name, address_1, address_2, city, \"\n", + "# \" state, zip)\")\n", + "# conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print('creating recipients table...')\n", + "# c.execute(\"CREATE TABLE recipients \"\n", + "# \"(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) \"\n", + "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "\n", + "# c.execute(\"INSERT IGNORE INTO recipients \"\n", + "# \"SELECT DISTINCT committee_id, committee_name FROM raw_table\")\n", + "# conn.commit()\n", + "\n", + "q='''\n", + "CREATE TABLE ria_data_science_s3.recipients as\n", + " SELECT DISTINCT committee_id, committee_name FROM ria_data_science_s3.raw_table\n", + "'''\n", + "c.execute(q)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "creating contributions table\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print('creating contributions table')\n", + "# c.execute(\"CREATE TABLE contributions \"\n", + "# \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n", + "# \" report_type VARCHAR(24), date_recieved DATE, \"\n", + "# \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n", + "# \" receipt_type VARCHAR(23), \"\n", + "# \" vendor_last_name VARCHAR(70), \"\n", + "# \" vendor_first_name VARCHAR(20), \"\n", + "# \" vendor_address_1 VARCHAR(35), vendor_address_2 
VARCHAR(31), \"\n", + "# \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n", + "# \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n", + "# \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n", + "# \" report_period_begin DATE, report_period_end DATE) \"\n", + "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "\n", + "\n", + "# c.execute(\"INSERT INTO contributions \"\n", + "# \"SELECT reciept_id, donors.donor_id, committee_id, \"\n", + "# \" report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), \"\n", + "# \" loan_amount, amount, \"\n", + "# \" receipt_type, vendor_last_name , \"\n", + "# \" vendor_first_name, vendor_address_1, vendor_address_2, \"\n", + "# \" vendor_city, vendor_state, vendor_zip, description, \"\n", + "# \" election_type, election_year, \"\n", + "# \" STR_TO_DATE(report_period_begin, '%m/%d/%Y'), \"\n", + "# \" STR_TO_DATE(report_period_end, '%m/%d/%Y') \"\n", + "# \"FROM raw_table JOIN donors ON \"\n", + "# \"donors.first_name = TRIM(raw_table.first_name) AND \"\n", + "# \"donors.last_name = TRIM(raw_table.last_name) AND \"\n", + "# \"donors.address_1 = TRIM(raw_table.address_1) AND \"\n", + "# \"donors.address_2 = TRIM(raw_table.address_2) AND \"\n", + "# \"donors.city = TRIM(raw_table.city) AND \"\n", + "# \"donors.state = TRIM(raw_table.state) AND \"\n", + "# \"donors.employer = TRIM(raw_table.employer) AND \"\n", + "# \"donors.occupation = TRIM(raw_table.occupation) AND \"\n", + "# \"donors.zip = TRIM(raw_table.zip)\")\n", + "# conn.commit()\n", + "\n", + "q='''\n", + "CREATE TABLE ria_data_science_s3.contributions as\n", + " SELECT reciept_id, donors.donor_id, committee_id, \n", + " report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n", + " loan_amount, amount, \n", + " receipt_type, vendor_last_name , \n", + " vendor_first_name, vendor_address_1, vendor_address_2, \n", + " vendor_city, vendor_state, vendor_zip, description, \n", + " election_type, election_year, \n", + " 
date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n", + " date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n", + " FROM ria_data_science_s3.raw_table JOIN ria_data_science_s3.donors ON \n", + " donors.first_name = TRIM(raw_table.first_name) AND \n", + " donors.last_name = TRIM(raw_table.last_name) AND \n", + " donors.address_1 = TRIM(raw_table.address_1) AND \n", + " donors.address_2 = TRIM(raw_table.address_2) AND \n", + " donors.city = TRIM(raw_table.city) AND \n", + " donors.state = TRIM(raw_table.state) AND \n", + " donors.employer = TRIM(raw_table.employer) AND \n", + " donors.occupation = TRIM(raw_table.occupation) AND \n", + " donors.zip = TRIM(raw_table.zip)'''\n", + "c.execute(q)\n", + "\n", + "\n", + "# print('creating indexes on contributions')\n", + "# c.execute(\"ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)\")\n", + "# c.execute(\"CREATE INDEX donor_idx ON contributions (donor_id)\")\n", + "# c.execute(\"CREATE INDEX recipient_idx ON contributions (recipient_id)\")\n", + "\n", + "\n", + "# conn.commit()\n", + "\n", + "# print('nullifying empty strings in donors')\n", + "# c.execute(\"UPDATE donors \"\n", + "# \"SET \"\n", + "# \"first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, \"\n", + "# \"last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, \"\n", + "# \"address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, \"\n", + "# \"address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, \"\n", + "# \"city = CASE city WHEN '' THEN NULL ELSE city END, \"\n", + "# \"state = CASE state WHEN '' THEN NULL ELSE state END, \"\n", + "# \"employer = CASE employer WHEN '' THEN NULL ELSE employer END, \" \n", + "# \"occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, \" \n", + "# \"zip = CASE zip WHEN '' THEN NULL ELSE zip END\")\n", + "\n", + "\n", + "# conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# c.execute(\"CREATE TABLE processed_donors AS \" \n", + "# \"(SELECT donor_id, \" \n", + "# \" LOWER(city) AS city, \" \n", + "# \" CASE WHEN (first_name IS NULL AND last_name IS NULL) \"\n", + "# \" THEN NULL \"\n", + "# \" ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) \"\n", + "# \" END AS name, \" \n", + "# \" LOWER(zip) AS zip, \" \n", + "# \" LOWER(state) AS state, \" \n", + "# \" CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \"\n", + "# \" THEN NULL \"\n", + "# \" ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) \"\n", + "# \" END AS address, \" \n", + "# \" LOWER(occupation) AS occupation, \"\n", + "# \" LOWER(employer) AS employer, \"\n", + "# \" ISNULL(first_name) AS person \"\n", + "# \" FROM donors)\")\n", + "q = '''\n", + "CREATE TABLE ria_data_science_s3.processed_donors AS \n", + " SELECT donor_id, \n", + " LOWER(city) AS city, \n", + " CASE WHEN (first_name IS NULL AND last_name IS NULL) \n", + " THEN NULL \n", + " ELSE LOWER(CONCAT(first_name, ' ', last_name)) \n", + " END AS name, \n", + " LOWER(zip) AS zip, \n", + " LOWER(state) AS state, \n", + " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n", + " THEN NULL \n", + " ELSE LOWER(CONCAT(address_1, ' ', address_2)) \n", + " END AS address, \n", + " LOWER(occupation) AS occupation, \n", + " LOWER(employer) AS employer, \n", + " first_name is null AS person \n", + " FROM ria_data_science_s3.donors'''\n", + "c.execute(q)\n", + "\n", + "\n", + "# c.execute(\"CREATE INDEX donor_idx ON processed_donors (donor_id)\")\n", + "\n", + "# c.close()\n", + "# conn.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('done')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1762975, 29)" 
+ ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/presto_example/README.md b/presto_example/README.md new file mode 100644 index 00000000..a027b3b0 --- /dev/null +++ b/presto_example/README.md @@ -0,0 +1,23 @@ +# MySQL Example + +Takes a database of IL campaign contribution data, loads it in to a +MySQL database, and identifies the unique donors. This can take a few +hours and will noticeably tax your laptop. You might want to run it +overnight. + +To follow this example you need to + +* Create a MySQL database called 'contributions' +* Copy `mysql_example/mysql.cnf_LOCAL` to `mysql_example/mysql.cnf` +* Update `mysql_example/mysql.cnf` with your MySQL username and password +* Install dependencies, `pip install -r requirements.txt` + +Once that's all done you can run the example: + +```bash +cd mysql_example +python mysql_init_db.py +python mysql_example.py +``` + + (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) diff --git a/presto_example/mysql.cnf_LOCAL b/presto_example/mysql.cnf_LOCAL new file mode 100644 index 00000000..17bded3f --- /dev/null +++ b/presto_example/mysql.cnf_LOCAL @@ -0,0 +1,4 @@ +[client] +user = your_mysql_user +password = your_mysql_password +default-character-set=utf8 diff --git a/presto_example/mysql_example.py b/presto_example/mysql_example.py new file mode 100644 index 00000000..5e257e13 --- /dev/null +++ b/presto_example/mysql_example.py @@ -0,0 +1,344 @@ +#!/usr/bin/python +# 
-*- coding: utf-8 -*- + +""" +This is an example of working with very large data. There are about +700,000 unduplicated donors in this database of Illinois political +campaign contributions. + +With such a large set of input data, we cannot store all the comparisons +we need to make in memory. Instead, we will read the pairs on demand +from the MySQL database. + +__Note:__ You will need to run `python mysql_init_db.py` +before running this script. See the annotated source for +[mysql_init_db.py](mysql_init_db.html) + +For smaller datasets (<10,000), see our +[csv_example](csv_example.html) +""" + +import os +import itertools +import time +import logging +import optparse +import locale +import json + +import MySQLdb +import MySQLdb.cursors + +import dedupe +import dedupe.backport + + +def record_pairs(result_set): + for i, row in enumerate(result_set): + a_record_id, a_record, b_record_id, b_record = row + record_a = (a_record_id, json.loads(a_record)) + record_b = (b_record_id, json.loads(b_record)) + + yield record_a, record_b + + if i % 10000 == 0: + print(i) + + +def cluster_ids(clustered_dupes): + + for cluster, scores in clustered_dupes: + cluster_id = cluster[0] + for donor_id, score in zip(cluster, scores): + yield donor_id, cluster_id, score + + +if __name__ == '__main__': + + # ## Logging + + # Dedupe uses Python logging to show or suppress verbose output. Added + # for convenience. 
To enable verbose output, run `python + # examples/mysql_example/mysql_example.py -v` + optp = optparse.OptionParser() + optp.add_option('-v', '--verbose', dest='verbose', action='count', + help='Increase verbosity (specify multiple times for more)' + ) + (opts, args) = optp.parse_args() + log_level = logging.WARNING + if opts.verbose: + if opts.verbose == 1: + log_level = logging.INFO + elif opts.verbose >= 2: + log_level = logging.DEBUG + logging.getLogger().setLevel(log_level) + + # ## Setup + MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf' + + settings_file = 'mysql_example_settings' + training_file = 'mysql_example_training.json' + + start_time = time.time() + + # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to + # `examples/mysql_example/mysql.cnf` and fill in your mysql database + # information in `examples/mysql_example/mysql.cnf` + + # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid + # having to have enormous result sets in + # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor). + read_con = MySQLdb.connect(db='contributions', + charset='utf8', + read_default_file=MYSQL_CNF, + cursorclass=MySQLdb.cursors.SSDictCursor) + + write_con = MySQLdb.connect(db='contributions', + charset='utf8', + read_default_file=MYSQL_CNF) + + # We'll be using variations on this following select statement to pull + # in campaign donor info. 
+ # + # We did a fair amount of preprocessing of the fields in + # `mysql_init_db.py` + + DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address " \ + "from processed_donors" + + # ## Training + + if os.path.exists(settings_file): + print('reading from ', settings_file) + with open(settings_file, 'rb') as sf: + deduper = dedupe.StaticDedupe(sf, num_cores=4) + else: + # Define the fields dedupe will pay attention to + # + # The address, city, and zip fields are often missing, so we'll + # tell dedupe that, and we'll learn a model that takes that into + # account + fields = [{'field': 'name', 'type': 'String'}, + {'field': 'address', 'type': 'String', + 'has missing': True}, + {'field': 'city', 'type': 'ShortString', 'has missing': True}, + {'field': 'state', 'type': 'ShortString', 'has missing': True}, + {'field': 'zip', 'type': 'ShortString', 'has missing': True}, + ] + + # Create a new deduper object and pass our data model to it. + deduper = dedupe.Dedupe(fields, num_cores=4) + + # We will sample pairs from the entire donor table for training + with read_con.cursor() as cur: + cur.execute(DONOR_SELECT) + temp_d = {i: row for i, row in enumerate(cur)} + + # If we have training data saved from a previous run of dedupe, + # look for it and load it in. + # + # __Note:__ if you want to train from + # scratch, delete the training_file + if os.path.exists(training_file): + print('reading labeled examples from ', training_file) + with open(training_file) as tf: + deduper.prepare_training(temp_d, training_file=tf) + else: + deduper.prepare_training(temp_d) + + del temp_d + + # ## Active learning + + print('starting active labeling...') + # Starts the training loop. Dedupe will find the next pair of records + # it is least certain about and ask you to label them as duplicates + # or not. 
+ + # use 'y', 'n' and 'u' keys to flag duplicates + # press 'f' when you are finished + dedupe.convenience.console_label(deduper) + # When finished, save our labeled, training pairs to disk + with open(training_file, 'w') as tf: + deduper.write_training(tf) + + # Notice the argument here + # + # `recall` is the proportion of true dupe pairs that the learned + # rules must cover. You may want to reduce this if you are making + # too many blocks and too many comparisons. + deduper.train(recall=0.90) + + with open(settings_file, 'wb') as sf: + deduper.write_settings(sf) + + # We can now remove some of the memory-hogging objects we used + # for training + deduper.cleanup_training() + + # ## Blocking + + print('blocking...') + + # To run blocking on such a large set of data, we create a separate table + # that contains blocking keys and record ids + print('creating blocking_map database') + with write_con.cursor() as cur: + cur.execute("DROP TABLE IF EXISTS blocking_map") + cur.execute("CREATE TABLE blocking_map " + "(block_key VARCHAR(200), donor_id INTEGER) " + "CHARACTER SET utf8 COLLATE utf8_unicode_ci") + + write_con.commit() + + # If dedupe learned an Index Predicate, we have to take a pass + # through the data and create indices. + print('creating inverted index') + + for field in deduper.fingerprinter.index_fields: + with read_con.cursor() as cur: + cur.execute("SELECT DISTINCT {field} FROM processed_donors " + "WHERE {field} IS NOT NULL".format(field=field)) + field_data = (row[0] for row in cur) + deduper.fingerprinter.index(field_data, field) + + # Now we are ready to write our blocking map table by creating a + # generator that yields unique `(block_key, donor_id)` tuples. 
+ print('writing blocking map') + + with read_con.cursor() as read_cur: + read_cur.execute(DONOR_SELECT) + full_data = ((row['donor_id'], row) for row in read_cur) + b_data = deduper.fingerprinter(full_data) + + with write_con.cursor() as write_cur: + + write_cur.executemany("INSERT INTO blocking_map VALUES (%s, %s)", + b_data) + + write_con.commit() + + # Free up memory by removing indices we don't need anymore + deduper.fingerprinter.reset_indices() + + # indexing blocking_map + print('creating index') + with write_con.cursor() as cur: + cur.execute("CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)") + + write_con.commit() + read_con.commit() + + # select unique pairs to compare + with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur: + + read_cur.execute(""" + select a.donor_id, + json_object('city', a.city, + 'name', a.name, + 'zip', a.zip, + 'state', a.state, + 'address', a.address), + b.donor_id, + json_object('city', b.city, + 'name', b.name, + 'zip', b.zip, + 'state', b.state, + 'address', b.address) + from (select DISTINCT l.donor_id as east, r.donor_id as west + from blocking_map as l + INNER JOIN blocking_map as r + using (block_key) + where l.donor_id < r.donor_id) ids + INNER JOIN processed_donors a on ids.east=a.donor_id + INNER JOIN processed_donors b on ids.west=b.donor_id + """) + + # ## Clustering + + print('clustering...') + clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)), + threshold=0.5) + + with write_con.cursor() as write_cur: + + # ## Writing out results + + # We now have a sequence of tuples of donor ids that dedupe believes + # all refer to the same entity. 
We write this out onto an entity map + # table + write_cur.execute("DROP TABLE IF EXISTS entity_map") + + print('creating entity_map database') + write_cur.execute("CREATE TABLE entity_map " + "(donor_id INTEGER, canon_id INTEGER, " + " cluster_score FLOAT, PRIMARY KEY(donor_id))") + + write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)', + cluster_ids(clustered_dupes)) + + write_con.commit() + + with write_con.cursor() as cur: + cur.execute("CREATE INDEX head_index ON entity_map (canon_id)") + + write_con.commit() + read_con.commit() + + # Print out the number of duplicates found + print('# duplicate sets') + + # ## Payoff + + # With all this done, we can now begin to ask interesting questions + # of the data + # + # For example, let's see who the top 10 donors are. + + locale.setlocale(locale.LC_ALL, '') # for pretty printing numbers + + with read_con.cursor() as cur: + # Create a temporary table so each group and unmatched record has + # a unique id + cur.execute("CREATE TEMPORARY TABLE e_map " + "SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id " + "FROM entity_map " + "RIGHT JOIN donors USING(donor_id)") + + cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, " + "donation_totals.totals AS totals " + "FROM donors INNER JOIN " + "(SELECT canon_id, SUM(amount) AS totals " + " FROM contributions INNER JOIN e_map " + " USING (donor_id) " + " GROUP BY (canon_id) " + " ORDER BY totals " + " DESC LIMIT 10) " + "AS donation_totals " + "WHERE donors.donor_id = donation_totals.canon_id") + + print("Top Donors (deduped)") + for row in cur: + row['totals'] = locale.currency(row['totals'], grouping=True) + print('%(totals)20s: %(name)s' % row) + + # Compare this to what we would have gotten if we hadn't done any + # deduplication + cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, " + "SUM(contributions.amount) AS totals " + "FROM donors INNER JOIN contributions " + "USING (donor_id) " + 
"GROUP BY (donor_id) " + "ORDER BY totals DESC " + "LIMIT 10") + + print("Top Donors (raw)") + for row in cur: + row['totals'] = locale.currency(row['totals'], grouping=True) + print('%(totals)20s: %(name)s' % row) + + # Close our database connection + read_con.close() + write_con.close() + + print('ran in', time.time() - start_time, 'seconds') diff --git a/presto_example/mysql_init_db.py b/presto_example/mysql_init_db.py new file mode 100644 index 00000000..fcdc1256 --- /dev/null +++ b/presto_example/mysql_init_db.py @@ -0,0 +1,234 @@ +#!/usr/bin/python +""" +This is a setup script for mysql_example. It downloads a zip file of +Illinois campaign contributions and loads them into a MySQL database +named 'contributions'. + +__Note:__ You will need to run this script first before execuing +[mysql_example.py](mysql_example.html). + +Tables created: +* raw_table - raw import of entire CSV file +* donors - all distinct donors based on name and address +* recipients - all distinct campaign contribution recipients +* contributions - contribution amounts tied to donor and recipients tables +""" + +import os +import zipfile +import warnings + +from urllib.request import urlopen + +import MySQLdb + +warnings.filterwarnings('ignore', category=MySQLdb.Warning) + +contributions_zip_file = 'Illinois-campaign-contributions.txt.zip' +contributions_txt_file = 'Illinois-campaign-contributions.txt' + +if not os.path.exists(contributions_zip_file) : + print('downloading', contributions_zip_file, '(~60mb) ...') + u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip') + localFile = open(contributions_zip_file, 'wb') + localFile.write(u.read()) + localFile.close() + +if not os.path.exists(contributions_txt_file) : + zip_file = zipfile.ZipFile(contributions_zip_file, 'r') + print('extracting %s' % contributions_zip_file) + zip_file_contents = zip_file.namelist() + for f in zip_file_contents: + if ('.txt' in f): + zip_file.extract(f) + zip_file.close() 
+ +conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf', + local_infile = 1, + sql_mode="ALLOW_INVALID_DATES", + db='contributions') +c = conn.cursor() + +print('importing raw data from csv...') +c.execute("DROP TABLE IF EXISTS raw_table") +c.execute("DROP TABLE IF EXISTS donors") +c.execute("DROP TABLE IF EXISTS recipients") +c.execute("DROP TABLE IF EXISTS contributions") +c.execute("DROP TABLE IF EXISTS processed_donors") + +c.execute("CREATE TABLE raw_table " + "(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), " + " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), " + " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), " + " date_recieved VARCHAR(10), loan_amount VARCHAR(12), " + " amount VARCHAR(23), receipt_type VARCHAR(23), " + " employer VARCHAR(70), occupation VARCHAR(40), " + " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), " + " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), " + " vendor_city VARCHAR(20), vendor_state VARCHAR(10), " + " vendor_zip VARCHAR(10), description VARCHAR(90), " + " election_type VARCHAR(10), election_year VARCHAR(10), " + " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), " + " committee_name VARCHAR(70), committee_id VARCHAR(37)) " + "CHARACTER SET utf8 COLLATE utf8_unicode_ci") + + +conn.commit() + +c.execute("LOAD DATA LOCAL INFILE %s INTO TABLE raw_table " + "FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\r\n' " + "IGNORE 1 LINES " + "(reciept_id, last_name, first_name, " + " address_1, address_2, city, state, " + " zip, report_type, date_recieved, " + " loan_amount, amount, receipt_type, " + " employer, occupation, vendor_last_name, " + " vendor_first_name, vendor_address_1, " + " vendor_address_2, vendor_city, vendor_state, " + " vendor_zip, description, election_type, " + " election_year, " + " report_period_begin, report_period_end, " + " committee_name, committee_id, @dummy)", + (contributions_txt_file,)) + +# 
Remove the very few records that mess up the demo +# (demo purposes only! Don't do something like this in production) +c.execute("DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10") + +# set empty, non-zero, strings in date columns to null +c.execute("UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10") +c.execute("UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10") + +#committee ID is requred. Remove the 2 rows that don't have it. +c.execute("DELETE FROM raw_table WHERE committee_id=''"); + +# There's a record with a date stuck in the committee_id column, which causes +# problems when inserting into the contributions table below. Get rid of it this +# way. +c.execute("DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9") +conn.commit() + + + +print('creating donors table...') +c.execute("CREATE TABLE donors " + "(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, " + " last_name VARCHAR(70), first_name VARCHAR(35), " + " address_1 VARCHAR(35), address_2 VARCHAR(36), " + " city VARCHAR(20), state VARCHAR(15), " + " zip VARCHAR(11), employer VARCHAR(70), " + " occupation VARCHAR(40)) " + "CHARACTER SET utf8 COLLATE utf8_unicode_ci") +c.execute("INSERT INTO donors " + "(first_name, last_name, address_1," + " address_2, city, state, zip, employer, occupation) " + "SELECT DISTINCT " + "TRIM(first_name), TRIM(last_name), TRIM(address_1), " + "TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), " + "TRIM(employer), TRIM(occupation) " + "FROM raw_table") +conn.commit() + + +print('creating indexes on donors table') +c.execute("CREATE INDEX donors_donor_info ON donors " + "(last_name, first_name, address_1, address_2, city, " + " state, zip)") +conn.commit() + + + +print('creating recipients table...') +c.execute("CREATE TABLE recipients " + "(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) " + "CHARACTER SET utf8 COLLATE utf8_unicode_ci") + +c.execute("INSERT IGNORE INTO recipients 
" + "SELECT DISTINCT committee_id, committee_name FROM raw_table") +conn.commit() + +print('creating contributions table') +c.execute("CREATE TABLE contributions " + "(contribution_id INT, donor_id INT, recipient_id INT, " + " report_type VARCHAR(24), date_recieved DATE, " + " loan_amount VARCHAR(12), amount VARCHAR(23), " + " receipt_type VARCHAR(23), " + " vendor_last_name VARCHAR(70), " + " vendor_first_name VARCHAR(20), " + " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), " + " vendor_city VARCHAR(20), vendor_state VARCHAR(10), " + " vendor_zip VARCHAR(10), description VARCHAR(90), " + " election_type VARCHAR(10), election_year VARCHAR(10), " + " report_period_begin DATE, report_period_end DATE) " + "CHARACTER SET utf8 COLLATE utf8_unicode_ci") + + +c.execute("INSERT INTO contributions " + "SELECT reciept_id, donors.donor_id, committee_id, " + " report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), " + " loan_amount, amount, " + " receipt_type, vendor_last_name , " + " vendor_first_name, vendor_address_1, vendor_address_2, " + " vendor_city, vendor_state, vendor_zip, description, " + " election_type, election_year, " + " STR_TO_DATE(report_period_begin, '%m/%d/%Y'), " + " STR_TO_DATE(report_period_end, '%m/%d/%Y') " + "FROM raw_table JOIN donors ON " + "donors.first_name = TRIM(raw_table.first_name) AND " + "donors.last_name = TRIM(raw_table.last_name) AND " + "donors.address_1 = TRIM(raw_table.address_1) AND " + "donors.address_2 = TRIM(raw_table.address_2) AND " + "donors.city = TRIM(raw_table.city) AND " + "donors.state = TRIM(raw_table.state) AND " + "donors.employer = TRIM(raw_table.employer) AND " + "donors.occupation = TRIM(raw_table.occupation) AND " + "donors.zip = TRIM(raw_table.zip)") +conn.commit() + +print('creating indexes on contributions') +c.execute("ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)") +c.execute("CREATE INDEX donor_idx ON contributions (donor_id)") +c.execute("CREATE INDEX recipient_idx ON contributions 
(recipient_id)") + + +conn.commit() + +print('nullifying empty strings in donors') +c.execute("UPDATE donors " + "SET " + "first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, " + "last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, " + "address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, " + "address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, " + "city = CASE city WHEN '' THEN NULL ELSE city END, " + "state = CASE state WHEN '' THEN NULL ELSE state END, " + "employer = CASE employer WHEN '' THEN NULL ELSE employer END, " + "occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, " + "zip = CASE zip WHEN '' THEN NULL ELSE zip END") + + +conn.commit() + +c.execute("CREATE TABLE processed_donors AS " + "(SELECT donor_id, " + " LOWER(city) AS city, " + " CASE WHEN (first_name IS NULL AND last_name IS NULL) " + " THEN NULL " + " ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) " + " END AS name, " + " LOWER(zip) AS zip, " + " LOWER(state) AS state, " + " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) " + " THEN NULL " + " ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) " + " END AS address, " + " LOWER(occupation) AS occupation, " + " LOWER(employer) AS employer, " + " ISNULL(first_name) AS person " + " FROM donors)") + +c.execute("CREATE INDEX donor_idx ON processed_donors (donor_id)") + +c.close() +conn.close() +print('done') diff --git a/presto_example/requirements.txt b/presto_example/requirements.txt new file mode 100644 index 00000000..18c098ae --- /dev/null +++ b/presto_example/requirements.txt @@ -0,0 +1 @@ +mysqlclient From 881d0726f14dfe831621a7db3331c944b68a924a Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 19 Jun 2020 21:50:18 +0000 Subject: [PATCH 02/19] starting the example --- notebooks/mysql_example.ipynb | 595 ++++++++++++++++++++++++++++++++++ notebooks/mysql_init_db.ipynb | 173 ++-------- 2 files changed, 615 insertions(+), 153 deletions(-) create mode 100644 
notebooks/mysql_example.ipynb diff --git a/notebooks/mysql_example.ipynb b/notebooks/mysql_example.ipynb new file mode 100644 index 00000000..d4ba20ac --- /dev/null +++ b/notebooks/mysql_example.ipynb @@ -0,0 +1,595 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n", + "bucket='com.ria.scratch'\n", + "region='eu-west-1'\n", + "workgroup = 'RIA'\n", + "root_key='as-dedupe/'\n", + "import sys\n", + "sys.path.insert(0, '../../dedupe/')\n", + "import mydedupe" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# %load ../mysql_example/mysql_example.py\n", + "#!/usr/bin/python\n", + "\n", + "\"\"\"\n", + "This is an example of working with very large data. There are about\n", + "700,000 unduplicated donors in this database of Illinois political\n", + "campaign contributions.\n", + "\n", + "With such a large set of input data, we cannot store all the comparisons\n", + "we need to make in memory. Instead, we will read the pairs on demand\n", + "from the MySQL database.\n", + "\n", + "__Note:__ You will need to run `python mysql_init_db.py`\n", + "before running this script. 
See the annotates source for\n", + "[mysql_init_db.py](mysql_init_db.html)\n", + "\n", + "For smaller datasets (<10,000), see our\n", + "[csv_example](csv_example.html)\n", + "\"\"\"\n", + "\n", + "import os\n", + "import itertools\n", + "import time\n", + "import logging\n", + "import optparse\n", + "import locale\n", + "import json\n", + "import pandas as pd\n", + "\n", + "# import MySQLdb\n", + "# import MySQLdb.cursors\n", + "\n", + "import dedupe\n", + "import dedupe.backport\n", + "import boto3\n", + "from pyathena import connect\n", + "from pyathena.pandas_cursor import PandasCursor\n", + "\n", + "def record_pairs(result_set):\n", + " for i, row in enumerate(result_set):\n", + " a_record_id, a_record, b_record_id, b_record = row\n", + " record_a = (a_record_id, json.loads(a_record))\n", + " record_b = (b_record_id, json.loads(b_record))\n", + "\n", + " yield record_a, record_b\n", + "\n", + " if i % 10000 == 0:\n", + " print(i)\n", + "\n", + "\n", + "def cluster_ids(clustered_dupes):\n", + "\n", + " for cluster, scores in clustered_dupes:\n", + " cluster_id = cluster[0]\n", + " for donor_id, score in zip(cluster, scores):\n", + " yield donor_id, cluster_id, score\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + "\n", + " # ## Logging\n", + "\n", + " # Dedupe uses Python logging to show or suppress verbose output. Added\n", + " # for convenience. 
To enable verbose output, run `python\n", + " # examples/mysql_example/mysql_example.py -v`\n", + " \n", + "# optp = optparse.OptionParser()\n", + "# optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", + "# help='Increase verbosity (specify multiple times for more)'\n", + "# )\n", + "# (opts, args) = optp.parse_args()\n", + "# log_level = logging.WARNING\n", + "# if opts.verbose:\n", + "# if opts.verbose == 1:\n", + "# log_level = logging.INFO\n", + "# elif opts.verbose >= 2:\n", + "# log_level = logging.DEBUG\n", + "\n", + "## Armin\n", + " log_level = logging.WARNING\n", + "#######\n", + "\n", + " logging.getLogger().setLevel(log_level)\n", + "\n", + " \n", + "\n", + "# # ## Setup\n", + "# MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf'\n", + "\n", + " settings_file = 'mysql_example_settings'\n", + " training_file = 'mysql_example_training.json'\n", + "\n", + " start_time = time.time()\n", + "\n", + " # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to\n", + " # `examples/mysql_example/mysql.cnf` and fill in your mysql database\n", + " # information in `examples/mysql_example/mysql.cnf`\n", + "\n", + " # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid\n", + " # having to have enormous result sets in\n", + " # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor).\n", + "# read_con = MySQLdb.connect(db='contributions',\n", + "# charset='utf8',\n", + "# read_default_file=MYSQL_CNF,\n", + "# cursorclass=MySQLdb.cursors.SSDictCursor)\n", + "\n", + "# write_con = MySQLdb.connect(db='contributions',\n", + "# charset='utf8',\n", + "# read_default_file=MYSQL_CNF)\n", + "\n", + "s3 = boto3.client('s3') \n", + "conn = connect(s3_staging_dir=athena_garbage,\n", + " region_name=region, work_group=workgroup)\n", + "cur = conn.cursor(PandasCursor)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + " # We'll be using variations on 
this following select statement to pull\n", + " # in campaign donor info.\n", + " #\n", + " # We did a fair amount of preprocessing of the fields in\n", + " # `mysql_init_db.py` \n", + " DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n", + " \"from ria_data_science_s3.processed_donors limit 1000\"\n", + "\n", + " # ## Training\n", + "\n", + " if os.path.exists(settings_file):\n", + " print('reading from ', settings_file)\n", + " with open(settings_file, 'rb') as sf:\n", + " deduper = dedupe.StaticDedupe(sf, num_cores=4)\n", + " else:\n", + " # Define the fields dedupe will pay attention to\n", + " #\n", + " # The address, city, and zip fields are often missing, so we'll\n", + " # tell dedupe that, and we'll learn a model that take that into\n", + " # account\n", + " fields = [{'field': 'name', 'type': 'String'},\n", + " {'field': 'address', 'type': 'String',\n", + " 'has missing': True},\n", + " {'field': 'city', 'type': 'ShortString', 'has missing': True},\n", + " {'field': 'state', 'type': 'ShortString', 'has missing': True},\n", + " {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n", + " ]\n", + "\n", + " # Create a new deduper object and pass our data model to it.\n", + " deduper = mydedupe.Dedupe(fields, num_cores=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + " # We will sample pairs from the entire donor table for training\n", + "# with read_con.cursor() as cur:\n", + "# cur.execute(DONOR_SELECT)\n", + "# temp_d = {i: row for i, row in enumerate(cur)}\n", + "\n", + " #Armin: Very Suspicious, does the ssdictcursor convers everything to string?\n", + " df = cur.execute(DONOR_SELECT).as_pandas()#.astype(str)\n", + " temp_d = df.where(pd.notnull(df), None).to_dict('index')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "name : None\n", + "address 
: None\n", + "city : st. louis\n", + "state : mo\n", + "zip : 63118\n", + "\n", + "name : None\n", + "address : None\n", + "city : None\n", + "state : il\n", + "zip : None\n", + "\n", + "0/10 positive, 0/10 negative\n", + "Do these records refer to the same thing?\n", + "(y)es / (n)o / (u)nsure / (f)inished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "starting active labeling...\n", + "y\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "name : nick manousopoulos\n", + "address : None\n", + "city : calumet city\n", + "state : il\n", + "zip : 60409\n", + "\n", + "name : None\n", + "address : None\n", + "city : None\n", + "state : il\n", + "zip : None\n", + "\n", + "1/10 positive, 0/10 negative\n", + "Do these records refer to the same thing?\n", + "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "name : gary h. smith\n", + "address : 205 w. wacker drive suite 510\n", + "city : chicago\n", + "state : il\n", + "zip : 60606\n", + "\n", + "name : None\n", + "address : None\n", + "city : chicago\n", + "state : il\n", + "zip : 60606\n", + "\n", + "2/10 positive, 0/10 negative\n", + "Do these records refer to the same thing?\n", + "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "name : sam vinson\n", + "address : None\n", + "city : chicago\n", + "state : il\n", + "zip : 60602\n", + "\n", + "name : john dore\n", + "address : 134 n. 
lasalle #1508\n", + "city : chicago\n", + "state : il\n", + "zip : 60602\n", + "\n", + "3/10 positive, 0/10 negative\n", + "Do these records refer to the same thing?\n", + "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "f\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Finished labeling\n" + ] + }, + { + "ename": "TypeError", + "evalue": "79578 is not JSON serializable", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;31m# When finished, save our labeled, training pairs to disk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mdeduper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_training\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mwrite_training\u001b[0;34m(self, file_obj)\u001b[0m\n\u001b[1;32m 1059\u001b[0m \u001b[0mfile_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1060\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mserializer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_to_json\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1061\u001b[0;31m ensure_ascii=True)\n\u001b[0m\u001b[1;32m 1062\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1063\u001b[0m def write_settings(self,\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/__init__.py\u001b[0m in \u001b[0;36mdump\u001b[0;34m(obj, fp, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;31m# could accelerate with writelines in some versions of Python, at\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;31m# a debuggability cost\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 180\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 430\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 431\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 405\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 
325\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 405\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Circular reference detected\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mmarkers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmarkerid\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 437\u001b[0;31m \u001b[0mo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 438\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/serializer.py\u001b[0m in 
\u001b[0;36m_to_json\u001b[0;34m(python_object)\u001b[0m\n\u001b[1;32m 19\u001b[0m '__value__': list(python_object)}\n\u001b[1;32m 20\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpython_object\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' is not JSON serializable'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpython_object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: 79578 is not JSON serializable" + ] + } + ], + "source": [ + " # If we have training data saved from a previous run of dedupe,\n", + " # look for it an load it in.\n", + " #\n", + " # __Note:__ if you want to train from\n", + " # scratch, delete the training_file\n", + " if os.path.exists(training_file):\n", + " print('reading labeled examples from ', training_file)\n", + " with open(training_file) as tf:\n", + " deduper.prepare_training(temp_d, training_file=tf)\n", + " else:\n", + " deduper.prepare_training(temp_d)\n", + "\n", + " del temp_d\n", + "\n", + " # ## Active learning\n", + "\n", + " print('starting active labeling...')\n", + " # Starts the training loop. 
Dedupe will find the next pair of records\n", + " # it is least certain about and ask you to label them as duplicates\n", + " # or not.\n", + "\n", + " # use 'y', 'n' and 'u' keys to flag duplicates\n", + " # press 'f' when you are finished\n", + " dedupe.convenience.console_label(deduper)\n", + " # When finished, save our labeled, training pairs to disk\n", + " with open(training_file, 'w') as tf:\n", + " deduper.write_training(tf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " # Notice our the argument here\n", + " #\n", + " # `recall` is the proportion of true dupes pairs that the learned\n", + " # rules must cover. You may want to reduce this if your are making\n", + " # too many blocks and too many comparisons.\n", + " deduper.train(recall=0.90)\n", + "\n", + " with open(settings_file, 'wb') as sf:\n", + " deduper.write_settings(sf)\n", + "\n", + " # We can now remove some of the memory hobbing objects we used\n", + " # for training\n", + " deduper.cleanup_training()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " # ## Blocking\n", + "\n", + " print('blocking...')\n", + "\n", + " # To run blocking on such a large set of data, we create a separate table\n", + " # that contains blocking keys and record ids\n", + " print('creating blocking_map database')\n", + " with write_con.cursor() as cur:\n", + " cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n", + " cur.execute(\"CREATE TABLE blocking_map \"\n", + " \"(block_key VARCHAR(200), donor_id INTEGER) \"\n", + " \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "\n", + " write_con.commit()\n", + "\n", + " # If dedupe learned a Index Predicate, we have to take a pass\n", + " # through the data and create indices.\n", + " print('creating inverted index')\n", + "\n", + " for field in deduper.fingerprinter.index_fields:\n", + " with read_con.cursor() as cur:\n", + " 
cur.execute(\"SELECT DISTINCT {field} FROM processed_donors \"\n", + " \"WHERE {field} IS NOT NULL\".format(field=field))\n", + " field_data = (row[0] for row in cur)\n", + " deduper.fingerprinter.index(field_data, field)\n", + "\n", + " # Now we are ready to write our blocking map table by creating a\n", + " # generator that yields unique `(block_key, donor_id)` tuples.\n", + " print('writing blocking map')\n", + "\n", + " with read_con.cursor() as read_cur:\n", + " read_cur.execute(DONOR_SELECT)\n", + " full_data = ((row['donor_id'], row) for row in read_cur)\n", + " b_data = deduper.fingerprinter(full_data)\n", + "\n", + " with write_con.cursor() as write_cur:\n", + "\n", + " write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n", + " b_data)\n", + "\n", + " write_con.commit()\n", + "\n", + " # Free up memory by removing indices we don't need anymore\n", + " deduper.fingerprinter.reset_indices()\n", + "\n", + " # indexing blocking_map\n", + " print('creating index')\n", + " with write_con.cursor() as cur:\n", + " cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n", + "\n", + " write_con.commit()\n", + " read_con.commit()\n", + "\n", + " # select unique pairs to compare\n", + " with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur:\n", + "\n", + " read_cur.execute(\"\"\"\n", + " select a.donor_id,\n", + " json_object('city', a.city,\n", + " 'name', a.name,\n", + " 'zip', a.zip,\n", + " 'state', a.state,\n", + " 'address', a.address),\n", + " b.donor_id,\n", + " json_object('city', b.city,\n", + " 'name', b.name,\n", + " 'zip', b.zip,\n", + " 'state', b.state,\n", + " 'address', b.address)\n", + " from (select DISTINCT l.donor_id as east, r.donor_id as west\n", + " from blocking_map as l\n", + " INNER JOIN blocking_map as r\n", + " using (block_key)\n", + " where l.donor_id < r.donor_id) ids\n", + " INNER JOIN processed_donors a on ids.east=a.donor_id\n", + " INNER JOIN processed_donors b on 
ids.west=b.donor_id\n", + " \"\"\")\n", + "\n", + " # ## Clustering\n", + "\n", + " print('clustering...')\n", + " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n", + " threshold=0.5)\n", + "\n", + " with write_con.cursor() as write_cur:\n", + "\n", + " # ## Writing out results\n", + "\n", + " # We now have a sequence of tuples of donor ids that dedupe believes\n", + " # all refer to the same entity. We write this out onto an entity map\n", + " # table\n", + " write_cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n", + "\n", + " print('creating entity_map database')\n", + " write_cur.execute(\"CREATE TABLE entity_map \"\n", + " \"(donor_id INTEGER, canon_id INTEGER, \"\n", + " \" cluster_score FLOAT, PRIMARY KEY(donor_id))\")\n", + "\n", + " write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)',\n", + " cluster_ids(clustered_dupes))\n", + "\n", + " write_con.commit()\n", + "\n", + " with write_con.cursor() as cur:\n", + " cur.execute(\"CREATE INDEX head_index ON entity_map (canon_id)\")\n", + "\n", + " write_con.commit()\n", + " read_con.commit()\n", + "\n", + " # Print out the number of duplicates found\n", + " print('# duplicate sets')\n", + "\n", + " # ## Payoff\n", + "\n", + " # With all this done, we can now begin to ask interesting questions\n", + " # of the data\n", + " #\n", + " # For example, let's see who the top 10 donors are.\n", + "\n", + " locale.setlocale(locale.LC_ALL, '') # for pretty printing numbers\n", + "\n", + " with read_con.cursor() as cur:\n", + " # Create a temporary table so each group and unmatched record has\n", + " # a unique id\n", + " cur.execute(\"CREATE TEMPORARY TABLE e_map \"\n", + " \"SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id \"\n", + " \"FROM entity_map \"\n", + " \"RIGHT JOIN donors USING(donor_id)\")\n", + "\n", + " cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, \"\n", + " \"donation_totals.totals AS totals \"\n", + " \"FROM donors INNER 
JOIN \"\n", + " \"(SELECT canon_id, SUM(amount) AS totals \"\n", + " \" FROM contributions INNER JOIN e_map \"\n", + " \" USING (donor_id) \"\n", + " \" GROUP BY (canon_id) \"\n", + " \" ORDER BY totals \"\n", + " \" DESC LIMIT 10) \"\n", + " \"AS donation_totals \"\n", + " \"WHERE donors.donor_id = donation_totals.canon_id\")\n", + "\n", + " print(\"Top Donors (deduped)\")\n", + " for row in cur:\n", + " row['totals'] = locale.currency(row['totals'], grouping=True)\n", + " print('%(totals)20s: %(name)s' % row)\n", + "\n", + " # Compare this to what we would have gotten if we hadn't done any\n", + " # deduplication\n", + " cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, \"\n", + " \"SUM(contributions.amount) AS totals \"\n", + " \"FROM donors INNER JOIN contributions \"\n", + " \"USING (donor_id) \"\n", + " \"GROUP BY (donor_id) \"\n", + " \"ORDER BY totals DESC \"\n", + " \"LIMIT 10\")\n", + "\n", + " print(\"Top Donors (raw)\")\n", + " for row in cur:\n", + " row['totals'] = locale.currency(row['totals'], grouping=True)\n", + " print('%(totals)20s: %(name)s' % row)\n", + "\n", + " # Close our database connection\n", + " read_con.close()\n", + " write_con.close()\n", + "\n", + " print('ran in', time.time() - start_time, 'seconds')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/mysql_init_db.ipynb b/notebooks/mysql_init_db.ipynb index 1dd956d9..bb6331a1 100644 --- a/notebooks/mysql_init_db.ipynb +++ 
b/notebooks/mysql_init_db.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -15,11 +15,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# %load ../mysql_example/mysql_init_db.py\n", "#!/usr/bin/python\n", "\"\"\"\n", "This is a setup script for mysql_example. It downloads a zip file of\n", @@ -55,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -78,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -96,27 +95,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "importing raw data from csv...\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "print('importing raw data from csv...')\n", "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n", @@ -128,20 +109,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# c.execute(\"CREATE TABLE raw_table \"\n", "# \"(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \"\n", @@ -191,43 +161,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", - 
"b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ResponseMetadata': {'RequestId': 'C8707997FC007A2B',\n", - " 'HostId': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n", - " 'HTTPStatusCode': 200,\n", - " 'HTTPHeaders': {'x-amz-id-2': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n", - " 'x-amz-request-id': 'C8707997FC007A2B',\n", - " 'date': 'Thu, 18 Jun 2020 03:05:54 GMT',\n", - " 'x-amz-server-side-encryption': 'AES256',\n", - " 'etag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n", - " 'content-length': '0',\n", - " 'server': 'AmazonS3'},\n", - " 'RetryAttempts': 0},\n", - " 'ETag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n", - " 'ServerSideEncryption': 'AES256'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# c.execute(\"LOAD DATA LOCAL INFILE %s INTO TABLE raw_table \"\n", "# \"FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\r\\n' \" \n", @@ -276,27 +212,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "creating donors table...\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "print('creating donors table...')\n", "# c.execute(\"CREATE TABLE donors \"\n", @@ -337,20 +255,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - 
"output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# print('creating recipients table...')\n", "# c.execute(\"CREATE TABLE recipients \"\n", @@ -370,27 +277,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "creating contributions table\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "print('creating contributions table')\n", "# c.execute(\"CREATE TABLE contributions \"\n", @@ -481,20 +370,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# c.execute(\"CREATE TABLE processed_donors AS \" \n", "# \"(SELECT donor_id, \" \n", @@ -551,20 +429,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1762975, 29)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.shape" ] From 448fdf7c6dd3ddf63e84b9ef4647dc65b7014c57 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 22 Jun 2020 21:53:59 +0000 Subject: [PATCH 03/19] debugging --- notebooks/mysql_example.ipynb | 153 ++-------------------------------- 1 file changed, 6 insertions(+), 147 deletions(-) diff --git a/notebooks/mysql_example.ipynb b/notebooks/mysql_example.ipynb index d4ba20ac..2c68caaa 100644 --- a/notebooks/mysql_example.ipynb +++ b/notebooks/mysql_example.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,7 @@ }, { 
"cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -178,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -194,150 +194,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "name : None\n", - "address : None\n", - "city : st. louis\n", - "state : mo\n", - "zip : 63118\n", - "\n", - "name : None\n", - "address : None\n", - "city : None\n", - "state : il\n", - "zip : None\n", - "\n", - "0/10 positive, 0/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "starting active labeling...\n", - "y\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "name : nick manousopoulos\n", - "address : None\n", - "city : calumet city\n", - "state : il\n", - "zip : 60409\n", - "\n", - "name : None\n", - "address : None\n", - "city : None\n", - "state : il\n", - "zip : None\n", - "\n", - "1/10 positive, 0/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "y\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "name : gary h. smith\n", - "address : 205 w. 
wacker drive suite 510\n", - "city : chicago\n", - "state : il\n", - "zip : 60606\n", - "\n", - "name : None\n", - "address : None\n", - "city : chicago\n", - "state : il\n", - "zip : 60606\n", - "\n", - "2/10 positive, 0/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "y\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "name : sam vinson\n", - "address : None\n", - "city : chicago\n", - "state : il\n", - "zip : 60602\n", - "\n", - "name : john dore\n", - "address : 134 n. lasalle #1508\n", - "city : chicago\n", - "state : il\n", - "zip : 60602\n", - "\n", - "3/10 positive, 0/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "f\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Finished labeling\n" - ] - }, - { - "ename": "TypeError", - "evalue": "79578 is not JSON serializable", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;31m# When finished, save our labeled, training pairs to disk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m 
\u001b[0mdeduper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_training\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mwrite_training\u001b[0;34m(self, file_obj)\u001b[0m\n\u001b[1;32m 1059\u001b[0m \u001b[0mfile_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1060\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mserializer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_to_json\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1061\u001b[0;31m ensure_ascii=True)\n\u001b[0m\u001b[1;32m 1062\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1063\u001b[0m def write_settings(self,\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/__init__.py\u001b[0m in \u001b[0;36mdump\u001b[0;34m(obj, fp, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;31m# could accelerate with writelines in some versions of Python, at\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;31m# a debuggability cost\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 180\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 428\u001b[0m 
\u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 430\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 431\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 405\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 406\u001b[0m 
\u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 402\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 405\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Circular reference detected\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mmarkers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmarkerid\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 437\u001b[0;31m \u001b[0mo\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0m_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 438\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/serializer.py\u001b[0m in \u001b[0;36m_to_json\u001b[0;34m(python_object)\u001b[0m\n\u001b[1;32m 19\u001b[0m '__value__': list(python_object)}\n\u001b[1;32m 20\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpython_object\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' is not JSON serializable'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpython_object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: 79578 is not JSON serializable" - ] - } - ], + "outputs": [], "source": [ " # If we have training data saved from a previous run of dedupe,\n", " # look for it an load it in.\n", From a7a2bfda90975dff4494463a528ff7cd5581975e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 22 Jun 2020 22:07:59 +0000 Subject: [PATCH 04/19] rename --- ...sql_example.ipynb => athena_example.ipynb} | 19 +++++++++++++++---- ...sql_init_db.ipynb => athena_init_db.ipynb} | 0 2 files changed, 15 insertions(+), 4 deletions(-) rename 
notebooks/{mysql_example.ipynb => athena_example.ipynb} (96%) rename notebooks/{mysql_init_db.ipynb => athena_init_db.ipynb} (100%) diff --git a/notebooks/mysql_example.ipynb b/notebooks/athena_example.ipynb similarity index 96% rename from notebooks/mysql_example.ipynb rename to notebooks/athena_example.ipynb index 2c68caaa..3a4ca13a 100644 --- a/notebooks/mysql_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -13,7 +13,7 @@ "root_key='as-dedupe/'\n", "import sys\n", "sys.path.insert(0, '../../dedupe/')\n", - "import mydedupe" + "import dedupe" ] }, { @@ -138,6 +138,16 @@ "cur = conn.cursor(PandasCursor)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm 'mysql_example_settings'\n", + "!rm 'mysql_example_training.json'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -173,7 +183,7 @@ " ]\n", "\n", " # Create a new deduper object and pass our data model to it.\n", - " deduper = mydedupe.Dedupe(fields, num_cores=4)" + " deduper = dedupe.Dedupe(fields, num_cores=4)" ] }, { @@ -187,8 +197,9 @@ "# cur.execute(DONOR_SELECT)\n", "# temp_d = {i: row for i, row in enumerate(cur)}\n", "\n", - " #Armin: Very Suspicious, does the ssdictcursor convers everything to string?\n", - " df = cur.execute(DONOR_SELECT).as_pandas()#.astype(str)\n", + " # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! 
\n", + " # But for that, astype doesn't work, and a loot on temp_d is slow, so for now let's just use str\n", + " df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n", " temp_d = df.where(pd.notnull(df), None).to_dict('index')" ] }, diff --git a/notebooks/mysql_init_db.ipynb b/notebooks/athena_init_db.ipynb similarity index 100% rename from notebooks/mysql_init_db.ipynb rename to notebooks/athena_init_db.ipynb From 1d7d92ba9d49b027293265fc2069ac124b4f0fb0 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 30 Jun 2020 17:49:54 +0000 Subject: [PATCH 05/19] init ready --- {presto_example => athena_example}/README.md | 0 .../mysql.cnf_LOCAL | 0 .../mysql_example.py | 0 .../mysql_init_db.py | 0 .../requirements.txt | 0 notebooks/athena_example.ipynb | 522 ++++++++++++------ notebooks/athena_init_db.ipynb | 334 +++-------- 7 files changed, 427 insertions(+), 429 deletions(-) rename {presto_example => athena_example}/README.md (100%) rename {presto_example => athena_example}/mysql.cnf_LOCAL (100%) rename {presto_example => athena_example}/mysql_example.py (100%) rename {presto_example => athena_example}/mysql_init_db.py (100%) rename {presto_example => athena_example}/requirements.txt (100%) diff --git a/presto_example/README.md b/athena_example/README.md similarity index 100% rename from presto_example/README.md rename to athena_example/README.md diff --git a/presto_example/mysql.cnf_LOCAL b/athena_example/mysql.cnf_LOCAL similarity index 100% rename from presto_example/mysql.cnf_LOCAL rename to athena_example/mysql.cnf_LOCAL diff --git a/presto_example/mysql_example.py b/athena_example/mysql_example.py similarity index 100% rename from presto_example/mysql_example.py rename to athena_example/mysql_example.py diff --git a/presto_example/mysql_init_db.py b/athena_example/mysql_init_db.py similarity index 100% rename from presto_example/mysql_init_db.py rename to athena_example/mysql_init_db.py diff --git a/presto_example/requirements.txt 
b/athena_example/requirements.txt similarity index 100% rename from presto_example/requirements.txt rename to athena_example/requirements.txt diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb index 3a4ca13a..34eb1d68 100644 --- a/notebooks/athena_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -5,20 +5,32 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [ + "!pip install dedupe pyathena" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], "source": [ "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n", "bucket='com.ria.scratch'\n", "region='eu-west-1'\n", "workgroup = 'RIA'\n", "root_key='as-dedupe/'\n", + "schema_name='ria_data_science_s3'\n", "import sys\n", "sys.path.insert(0, '../../dedupe/')\n", - "import dedupe" + "import dedupe\n", + "from io import StringIO\n", + "import csv" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -60,6 +72,11 @@ "from pyathena import connect\n", "from pyathena.pandas_cursor import PandasCursor\n", "\n", + "def dict_cursor_execute(cur, query):\n", + " df = cur.execute(query).as_pandas()\n", + " return df.where(pd.notnull(df), None).astype(str)\n", + "\n", + "\n", "def record_pairs(result_set):\n", " for i, row in enumerate(result_set):\n", " a_record_id, a_record, b_record_id, b_record = row\n", @@ -132,27 +149,35 @@ "# charset='utf8',\n", "# read_default_file=MYSQL_CNF)\n", "\n", - "s3 = boto3.client('s3') \n", - "conn = connect(s3_staging_dir=athena_garbage,\n", - " region_name=region, work_group=workgroup)\n", - "cur = conn.cursor(PandasCursor)" + " s3 = boto3.client('s3') \n", + " conn = connect(s3_staging_dir=athena_garbage,\n", + " region_name=region, work_group=workgroup)\n", + " cur = conn.cursor(PandasCursor, schema_name=schema_name)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], 
"source": [ - "!rm 'mysql_example_settings'\n", - "!rm 'mysql_example_training.json'" + "# !rm 'mysql_example_settings'\n", + "# !rm 'mysql_example_training.json'" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading from mysql_example_settings\n" + ] + } + ], "source": [ " # We'll be using variations on this following select statement to pull\n", " # in campaign donor info.\n", @@ -160,7 +185,7 @@ " # We did a fair amount of preprocessing of the fields in\n", " # `mysql_init_db.py` \n", " DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n", - " \"from ria_data_science_s3.processed_donors limit 1000\"\n", + " \"from processed_donors\"\n", "\n", " # ## Training\n", "\n", @@ -183,15 +208,8 @@ " ]\n", "\n", " # Create a new deduper object and pass our data model to it.\n", - " deduper = dedupe.Dedupe(fields, num_cores=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " deduper = dedupe.Dedupe(fields, num_cores=4)\n", + "\n", " # We will sample pairs from the entire donor table for training\n", "# with read_con.cursor() as cur:\n", "# cur.execute(DONOR_SELECT)\n", @@ -199,16 +217,13 @@ "\n", " # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! 
\n", " # But for that, astype doesn't work, and a loot on temp_d is slow, so for now let's just use str\n", - " df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n", - " temp_d = df.where(pd.notnull(df), None).to_dict('index')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n", + " # Something like this is much faster, but let's keep the changes minimal for now\n", + " # df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n", + " # temp_d = df.where(pd.notnull(df), None).to_dict('index')\n", + " cursor_df = dict_cursor_execute(cursor, DONOR_SELECT)\n", + " temp_d = cursor_df.to_dict('index')\n", + "\n", " # If we have training data saved from a previous run of dedupe,\n", " # look for it an load it in.\n", " #\n", @@ -235,15 +250,8 @@ " dedupe.convenience.console_label(deduper)\n", " # When finished, save our labeled, training pairs to disk\n", " with open(training_file, 'w') as tf:\n", - " deduper.write_training(tf)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " deduper.write_training(tf)\n", + "\n", " # Notice our the argument here\n", " #\n", " # `recall` is the proportion of true dupes pairs that the learned\n", @@ -261,9 +269,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "blocking...\n", + "creating blocking_map database\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ " # ## Blocking\n", "\n", @@ -272,108 +299,243 @@ " # To run blocking on such a large set of data, we create a separate table\n", " # that contains blocking keys and record ids\n", " print('creating blocking_map database')\n", - " 
with write_con.cursor() as cur:\n", - " cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n", - " cur.execute(\"CREATE TABLE blocking_map \"\n", - " \"(block_key VARCHAR(200), donor_id INTEGER) \"\n", - " \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", - "\n", - " write_con.commit()\n", - "\n", + "# with write_con.cursor() as cur:\n", + "# cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n", + "# cur.execute(\"CREATE TABLE blocking_map \"\n", + "# \"(block_key VARCHAR(200), donor_id INTEGER) \"\n", + "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "\n", + "# write_con.commit()\n", + " cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n", + "\n", + " q='''\n", + " CREATE EXTERNAL TABLE blocking_map \n", + " (block_key VARCHAR(200), donor_id INTEGER)\n", + " ROW FORMAT DELIMITED\n", + " FIELDS TERMINATED BY '\\t'\n", + " LINES TERMINATED BY '\\n' \n", + " LOCATION\n", + " 's3://{}/{}' \n", + " TBLPROPERTIES (\n", + " 'classification'='csv', \n", + " --'skip.header.line.count'='1', \n", + " 'serialization.null.format'='')\n", + " '''.format(bucket, root_key+'blocking_map') \n", + " cur.execute(q)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "creating inverted index\n" + ] + } + ], + "source": [ " # If dedupe learned a Index Predicate, we have to take a pass\n", " # through the data and create indices.\n", " print('creating inverted index')\n", "\n", " for field in deduper.fingerprinter.index_fields:\n", - " with read_con.cursor() as cur:\n", - " cur.execute(\"SELECT DISTINCT {field} FROM processed_donors \"\n", - " \"WHERE {field} IS NOT NULL\".format(field=field))\n", - " field_data = (row[0] for row in cur)\n", - " deduper.fingerprinter.index(field_data, field)\n", - "\n", + " q = '''\n", + " SELECT DISTINCT {field} FROM processed_donors \n", + " WHERE {field} IS NOT NULL\n", + " '''.format(field=field)\n", + " cur_df = 
dict_cursor_execute(cur, q)\n", + " # Do I need to cast it as a list?\n", + " field_data = cur_df[field]\n", + " deduper.fingerprinter.index(field_data, field)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "writing blocking map\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ResponseMetadata': {'RequestId': '5F215F152B811909',\n", + " 'HostId': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amz-id-2': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n", + " 'x-amz-request-id': '5F215F152B811909',\n", + " 'date': 'Tue, 30 Jun 2020 15:27:16 GMT',\n", + " 'x-amz-server-side-encryption': 'AES256',\n", + " 'etag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n", + " 'content-length': '0',\n", + " 'server': 'AmazonS3'},\n", + " 'RetryAttempts': 0},\n", + " 'ETag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n", + " 'ServerSideEncryption': 'AES256'}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ " # Now we are ready to write our blocking map table by creating a\n", " # generator that yields unique `(block_key, donor_id)` tuples.\n", " print('writing blocking map')\n", + " \n", "\n", - " with read_con.cursor() as read_cur:\n", - " read_cur.execute(DONOR_SELECT)\n", - " full_data = ((row['donor_id'], row) for row in read_cur)\n", - " b_data = deduper.fingerprinter(full_data)\n", - "\n", - " with write_con.cursor() as write_cur:\n", - "\n", - " write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n", - " b_data)\n", - "\n", - " write_con.commit()\n", + " read_cur_dict = dict_cursor_execute(cur, DONOR_SELECT).to_dict('records')\n", + " full_data = ((row['donor_id'], row) for row in read_cur_dict)\n", + " b_data = deduper.fingerprinter(full_data)\n", + " buffer = 
pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n", + "# csv_out.writerows(b_data) \n", "\n", - " # Free up memory by removing indices we don't need anymore\n", - " deduper.fingerprinter.reset_indices()\n", + "# \"\\n\".join(b_data)\n", + "# with write_con.cursor() as write_cur:\n", "\n", + "# write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n", + "# b_data)\n", + " s3.put_object(Bucket=bucket, Key=root_key+'blocking_map/blocking.csv', Body=buffer) \n", + "# write_con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ " # indexing blocking_map\n", - " print('creating index')\n", - " with write_con.cursor() as cur:\n", - " cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n", + "# print('creating index')\n", + "# with write_con.cursor() as cur:\n", + "# cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n", "\n", - " write_con.commit()\n", - " read_con.commit()\n", + "# write_con.commit()\n", + "# read_con.commit()\n", "\n", " # select unique pairs to compare\n", - " with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur:\n", - "\n", - " read_cur.execute(\"\"\"\n", - " select a.donor_id,\n", - " json_object('city', a.city,\n", - " 'name', a.name,\n", - " 'zip', a.zip,\n", - " 'state', a.state,\n", - " 'address', a.address),\n", - " b.donor_id,\n", - " json_object('city', b.city,\n", - " 'name', b.name,\n", - " 'zip', b.zip,\n", - " 'state', b.state,\n", - " 'address', b.address)\n", - " from (select DISTINCT l.donor_id as east, r.donor_id as west\n", - " from blocking_map as l\n", - " INNER JOIN blocking_map as r\n", - " using (block_key)\n", - " where l.donor_id < r.donor_id) ids\n", - " INNER JOIN processed_donors a on ids.east=a.donor_id\n", - " INNER JOIN processed_donors b on ids.west=b.donor_id\n", - " \"\"\")\n", - "\n", - " # ## Clustering\n", - "\n", - " 
print('clustering...')\n", - " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n", - " threshold=0.5)\n", - "\n", - " with write_con.cursor() as write_cur:\n", - "\n", - " # ## Writing out results\n", - "\n", - " # We now have a sequence of tuples of donor ids that dedupe believes\n", - " # all refer to the same entity. We write this out onto an entity map\n", - " # table\n", - " write_cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n", - "\n", - " print('creating entity_map database')\n", - " write_cur.execute(\"CREATE TABLE entity_map \"\n", - " \"(donor_id INTEGER, canon_id INTEGER, \"\n", - " \" cluster_score FLOAT, PRIMARY KEY(donor_id))\")\n", - "\n", - " write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)',\n", - " cluster_ids(clustered_dupes))\n", - "\n", - " write_con.commit()\n", - "\n", - " with write_con.cursor() as cur:\n", - " cur.execute(\"CREATE INDEX head_index ON entity_map (canon_id)\")\n", - "\n", - " write_con.commit()\n", - " read_con.commit()\n", + " q='''\n", + " SELECT a.donor_id,\n", + " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n", + " ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n", + " AS JSON)),\n", + " b.donor_id,\n", + " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n", + " ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n", + " AS JSON))\n", + " FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", + " from blocking_map as l\n", + " INNER JOIN blocking_map as r\n", + " using (block_key)\n", + " where l.donor_id < r.donor_id) ids\n", + " INNER JOIN processed_donors a on ids.east=a.donor_id\n", + " INNER JOIN processed_donors b on ids.west=b.donor_id\n", + " '''\n", + " read_cur_dict=dict_cursor_execute(cur, q).itertuples(index=False, name=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "StopIteration", + "evalue": "", + "output_type": "error", + 
"traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread_cur_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m: " + ] + } + ], + "source": [ + "next(read_cur_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "clustering...\n" + ] + }, + { + "ename": "BlockingError", + "evalue": "No records have been blocked together. Is the data you are trying to match like the data you trained on?", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mBlockingError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'clustering...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n\u001b[0m\u001b[1;32m 5\u001b[0m threshold=0.5)\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mscore\u001b[0;34m(self, pairs)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_model\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 
106\u001b[0;31m self.num_cores)\n\u001b[0m\u001b[1;32m 107\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m raise RuntimeError('''\n", + "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/core.py\u001b[0m in \u001b[0;36mscoreDuplicates\u001b[0;34m(record_pairs, data_model, classifier, num_cores)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mfirst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecord_pairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpeek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord_pairs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfirst\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m raise BlockingError(\"No records have been blocked together. \"\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\"Is the data you are trying to match like \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \"the data you trained on?\")\n", + "\u001b[0;31mBlockingError\u001b[0m: No records have been blocked together. Is the data you are trying to match like the data you trained on?" 
+ ] + } + ], + "source": [ + " # ## Clustering\n", "\n", + " print('clustering...')\n", + " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n", + " threshold=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n", + "\n", + " print('creating entity_map database')\n", + " q='''\n", + " CREATE EXTERNAL TABLE entity_map \n", + " (donor_id INTEGER, canon_id INTEGER, \n", + " cluster_score FLOAT)\n", + " ROW FORMAT DELIMITED\n", + " FIELDS TERMINATED BY '\\t'\n", + " LINES TERMINATED BY '\\n' \n", + " LOCATION\n", + " 's3://{}/{}' \n", + " TBLPROPERTIES (\n", + " 'classification'='csv', \n", + " --'skip.header.line.count'='1', \n", + " 'serialization.null.format'='')\n", + " '''.format(bucket, root_key+'entity_map') \n", + " cur.execute(q) \n", + "\n", + " buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n", + " s3.put_object(Bucket=bucket, Key=root_key+'entity_map/entity_map.csv', Body=buffer) \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ " # Print out the number of duplicates found\n", " print('# duplicate sets')\n", "\n", @@ -385,52 +547,36 @@ " # For example, let's see who the top 10 donors are.\n", "\n", " locale.setlocale(locale.LC_ALL, '') # for pretty printing numbers\n", - "\n", - " with read_con.cursor() as cur:\n", - " # Create a temporary table so each group and unmatched record has\n", - " # a unique id\n", - " cur.execute(\"CREATE TEMPORARY TABLE e_map \"\n", - " \"SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id \"\n", - " \"FROM entity_map \"\n", - " \"RIGHT JOIN donors USING(donor_id)\")\n", - "\n", - " cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, \"\n", - " \"donation_totals.totals AS totals \"\n", - " \"FROM donors INNER JOIN 
\"\n", - " \"(SELECT canon_id, SUM(amount) AS totals \"\n", - " \" FROM contributions INNER JOIN e_map \"\n", - " \" USING (donor_id) \"\n", - " \" GROUP BY (canon_id) \"\n", - " \" ORDER BY totals \"\n", - " \" DESC LIMIT 10) \"\n", - " \"AS donation_totals \"\n", - " \"WHERE donors.donor_id = donation_totals.canon_id\")\n", - "\n", - " print(\"Top Donors (deduped)\")\n", - " for row in cur:\n", - " row['totals'] = locale.currency(row['totals'], grouping=True)\n", - " print('%(totals)20s: %(name)s' % row)\n", - "\n", - " # Compare this to what we would have gotten if we hadn't done any\n", - " # deduplication\n", - " cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, \"\n", - " \"SUM(contributions.amount) AS totals \"\n", - " \"FROM donors INNER JOIN contributions \"\n", - " \"USING (donor_id) \"\n", - " \"GROUP BY (donor_id) \"\n", - " \"ORDER BY totals DESC \"\n", - " \"LIMIT 10\")\n", - "\n", - " print(\"Top Donors (raw)\")\n", - " for row in cur:\n", - " row['totals'] = locale.currency(row['totals'], grouping=True)\n", - " print('%(totals)20s: %(name)s' % row)\n", - "\n", - " # Close our database connection\n", - " read_con.close()\n", - " write_con.close()\n", - "\n", - " print('ran in', time.time() - start_time, 'seconds')" + " \n", + " cur.execute(\"DROP TABLE IF EXISTS e_map\")\n", + "\n", + " q = '''\n", + " CREATE TABLE e_map as \n", + " SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id \n", + " FROM entity_map \n", + " RIGHT JOIN donors USING(donor_id)\n", + " '''\n", + " \n", + " cur.execute(q)\n", + " q ='''\n", + " SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", + " donation_totals.totals AS totals \n", + " FROM donors INNER JOIN \n", + " (SELECT canon_id, SUM(cast (amount as double)) AS totals \n", + " FROM contributions INNER JOIN e_map \n", + " USING (donor_id) \n", + " GROUP BY (canon_id) \n", + " ORDER BY totals \n", + " DESC 
LIMIT 10) \n", + " AS donation_totals \n", + " ON donors.donor_id = donation_totals.canon_id\n", + " '''\n", + " cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n", + "\n", + " print(\"Top Donors (deduped)\")\n", + " for row in cur_dict:\n", + " row['totals'] = locale.currency(row['totals'], grouping=True)\n", + " print('%(totals)20s: %(name)s' % row)" ] }, { @@ -438,7 +584,33 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + " # Compare this to what we would have gotten if we hadn't done any\n", + " # deduplication\n", + "\n", + " q = '''\n", + " SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", + " SUM(cast(contributions.amount as double)) AS totals \n", + " FROM donors INNER JOIN contributions \n", + " USING (donor_id) \n", + " GROUP BY donor_id), name\n", + " ORDER BY totals DESC \n", + " LIMIT 10\")\n", + " '''\n", + "\n", + " cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n", + "\n", + " print(\"Top Donors (raw)\")\n", + " for row in cur:\n", + " row['totals'] = locale.currency(row['totals'], grouping=True)\n", + " print('%(totals)20s: %(name)s' % row)\n", + "\n", + " # Close our database connection\n", + "# read_con.close()\n", + "# write_con.close()\n", + "\n", + " print('ran in', time.time() - start_time, 'seconds')" + ] } ], "metadata": { diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb index bb6331a1..b86d1b9f 100644 --- a/notebooks/athena_init_db.ipynb +++ b/notebooks/athena_init_db.ipynb @@ -2,31 +2,54 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting ../athena_example/config.py\n" + ] + } + ], "source": [ - "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n", - "bucket='com.ria.scratch'\n", - "region='eu-west-1'\n", - "workgroup = 
'RIA'\n", - "root_key='as-dedupe/'" + "%%writefile ../athena_example/config.py\n", + "# Connection parameters\n", + "ACCESS_KEY_ID = None\n", + "SECRET_ACCESS_KEY = None\n", + "ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n", + "WORKGROUP = 'RIA'\n", + "REGION = 'eu-west-1'\n", + "\n", + "# Database Parameters\n", + "DATABASE_BUCKET = 'com.ria.scratch'\n", + "DATABASE_ROOT_KEY = 'as-dedupe/'" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting ../athena_example/athena_example.py\n" + ] + } + ], "source": [ + "%%writefile ../athena_example/athena_example.py\n", "#!/usr/bin/python\n", "\"\"\"\n", - "This is a setup script for mysql_example. It downloads a zip file of\n", - "Illinois campaign contributions and loads them into a MySQL database\n", + "This is a setup script for athena_example. It downloads a zip file of\n", + "Illinois campaign contributions and loads them into a Athena database\n", "named 'contributions'.\n", " \n", "__Note:__ You will need to run this script first before execuing\n", - "[mysql_example.py](mysql_example.html).\n", + "[athena_example.py](athena_example.py).\n", " \n", "Tables created:\n", "* raw_table - raw import of entire CSV file\n", @@ -43,21 +66,12 @@ "from urllib.request import urlopen\n", "import boto3\n", "from pyathena import connect\n", + "import config\n", "\n", - "# import MySQLdb\n", - "\n", - "# warnings.filterwarnings('ignore', category=MySQLdb.Warning)\n", "\n", "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", - "contributions_txt_file = 'Illinois-campaign-contributions.txt'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n", + "\n", "if not os.path.exists(contributions_zip_file) :\n", " 
print('downloading', contributions_zip_file, '(~60mb) ...')\n", " u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n", @@ -72,65 +86,27 @@ " for f in zip_file_contents:\n", " if ('.txt' in f):\n", " zip_file.extract(f)\n", - " zip_file.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf', \n", - "# local_infile = 1,\n", - "# sql_mode=\"ALLOW_INVALID_DATES\",\n", - "# db='contributions')\n", - "# c = conn.cursor()\n", + " zip_file.close()\n", + "\n", "\n", "s3 = boto3.client('s3') \n", - "conn = connect(s3_staging_dir=athena_garbage,\n", - " region_name=region, work_group=workgroup)\n", - "c = conn.cursor()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "\n", + "\n", + "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n", + " aws_secret_access_key=config.SECRET_ACCESS_KEY,\n", + " s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n", + " region_name=config.REGION, \n", + " work_group=config.WORKGROUP)\n", + "c = conn.cursor()\n", + "\n", "print('importing raw data from csv...')\n", "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n", "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.donors\")\n", "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.recipients\")\n", "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.contributions\")\n", - "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# c.execute(\"CREATE TABLE raw_table \"\n", - "# \"(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \"\n", - "# \" address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \"\n", - "# \" state VARCHAR(15), zip VARCHAR(11), report_type 
VARCHAR(24), \"\n", - "# \" date_recieved VARCHAR(10), loan_amount VARCHAR(12), \"\n", - "# \" amount VARCHAR(23), receipt_type VARCHAR(23), \"\n", - "# \" employer VARCHAR(70), occupation VARCHAR(40), \"\n", - "# \" vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \"\n", - "# \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n", - "# \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n", - "# \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n", - "# \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n", - "# \" report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \"\n", - "# \" committee_name VARCHAR(70), committee_id VARCHAR(37)) \"\n", - "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")\n", "\n", "\n", - "# conn.commit()\n", "q=r'''\n", "CREATE EXTERNAL TABLE ria_data_science_s3.raw_table \n", " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", @@ -155,85 +131,35 @@ " 'classification'='csv', \n", " 'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - "'''.format(bucket, root_key+'raw_table') \n", - "c.execute(q)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# c.execute(\"LOAD DATA LOCAL INFILE %s INTO TABLE raw_table \"\n", - "# \"FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\r\\n' \" \n", - "# \"IGNORE 1 LINES \"\n", - "# \"(reciept_id, last_name, first_name, \"\n", - "# \" address_1, address_2, city, state, \"\n", - "# \" zip, report_type, date_recieved, \"\n", - "# \" loan_amount, amount, receipt_type, \"\n", - "# \" employer, occupation, vendor_last_name, \"\n", - "# \" vendor_first_name, vendor_address_1, \"\n", - "# \" vendor_address_2, vendor_city, vendor_state, \"\n", - "# \" vendor_zip, description, election_type, \"\n", - "# \" election_year, \"\n", - "# \" report_period_begin, report_period_end, 
\"\n", - "# \" committee_name, committee_id, @dummy)\",\n", - "# (contributions_txt_file,))\n", + "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') \n", + "c.execute(q)\n", + "\n", + "\n", "\n", "df = pd.read_csv(contributions_txt_file, sep='\\t', error_bad_lines=False, dtype=str, index_col=0)\n", "# Remove the very few records that mess up the demo \n", "# (demo purposes only! Don't do something like this in production)\n", - "# c.execute(\"DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10\")\n", "df = df[df['RcvDate'].str.len()>=10]\n", "\n", "# set empty, non-zero, strings in date columns to null\n", - "# c.execute(\"UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10\")\n", "df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n", "\n", - "# c.execute(\"UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10\")\n", "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", "\n", "#committee ID is requred. Remove the 2 rows that don't have it.\n", - "# c.execute(\"DELETE FROM raw_table WHERE committee_id=''\");\n", "df = df[df['ID'] != '']\n", "\n", "# There's a record with a date stuck in the committee_id column, which causes\n", "# problems when inserting into the contributions table below. 
Get rid of it this \n", "# way.\n", - "# c.execute(\"DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9\")\n", "df = df[df['ID'].str.len() <=9]\n", "\n", "# Nullifying empty strings\n", "df = df.replace(r'^\\s*$', np.nan, regex=True)\n", "\n", - "s3.put_object(Bucket=bucket, Key=root_key+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))\n", + "\n", "print('creating donors table...')\n", - "# c.execute(\"CREATE TABLE donors \"\n", - "# \"(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, \"\n", - "# \" last_name VARCHAR(70), first_name VARCHAR(35), \"\n", - "# \" address_1 VARCHAR(35), address_2 VARCHAR(36), \"\n", - "# \" city VARCHAR(20), state VARCHAR(15), \"\n", - "# \" zip VARCHAR(11), employer VARCHAR(70), \"\n", - "# \" occupation VARCHAR(40)) \"\n", - "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", - "# c.execute(\"INSERT INTO donors \"\n", - "# \"(first_name, last_name, address_1,\"\n", - "# \" address_2, city, state, zip, employer, occupation) \"\n", - "# \"SELECT DISTINCT \"\n", - "# \"TRIM(first_name), TRIM(last_name), TRIM(address_1), \"\n", - "# \"TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), \"\n", - "# \"TRIM(employer), TRIM(occupation) \"\n", - "# \"FROM raw_table\")\n", - "# conn.commit()\n", "q='''\n", "CREATE TABLE ria_data_science_s3.donors as\n", " with tmp as\n", @@ -246,79 +172,15 @@ " FROM ria_data_science_s3.raw_table)\n", " SELECT row_number() over () as donor_id, * from tmp'''\n", "c.execute(q)\n", - "# print('creating indexes on donors table')\n", - "# c.execute(\"CREATE INDEX donors_donor_info ON donors \"\n", - "# \"(last_name, first_name, address_1, address_2, city, \"\n", - "# \" state, zip)\")\n", - "# conn.commit()" - ] - }, - { - "cell_type": "code", 
- "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print('creating recipients table...')\n", - "# c.execute(\"CREATE TABLE recipients \"\n", - "# \"(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) \"\n", - "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", "\n", - "# c.execute(\"INSERT IGNORE INTO recipients \"\n", - "# \"SELECT DISTINCT committee_id, committee_name FROM raw_table\")\n", - "# conn.commit()\n", "\n", "q='''\n", "CREATE TABLE ria_data_science_s3.recipients as\n", " SELECT DISTINCT committee_id, committee_name FROM ria_data_science_s3.raw_table\n", "'''\n", - "c.execute(q)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('creating contributions table')\n", - "# c.execute(\"CREATE TABLE contributions \"\n", - "# \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n", - "# \" report_type VARCHAR(24), date_recieved DATE, \"\n", - "# \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n", - "# \" receipt_type VARCHAR(23), \"\n", - "# \" vendor_last_name VARCHAR(70), \"\n", - "# \" vendor_first_name VARCHAR(20), \"\n", - "# \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n", - "# \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n", - "# \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n", - "# \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n", - "# \" report_period_begin DATE, report_period_end DATE) \"\n", - "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", - "\n", - "\n", - "# c.execute(\"INSERT INTO contributions \"\n", - "# \"SELECT reciept_id, donors.donor_id, committee_id, \"\n", - "# \" report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), \"\n", - "# \" loan_amount, amount, \"\n", - "# \" receipt_type, vendor_last_name , \"\n", - "# \" vendor_first_name, vendor_address_1, vendor_address_2, \"\n", - "# \" vendor_city, vendor_state, vendor_zip, description, \"\n", - "# \" 
election_type, election_year, \"\n", - "# \" STR_TO_DATE(report_period_begin, '%m/%d/%Y'), \"\n", - "# \" STR_TO_DATE(report_period_end, '%m/%d/%Y') \"\n", - "# \"FROM raw_table JOIN donors ON \"\n", - "# \"donors.first_name = TRIM(raw_table.first_name) AND \"\n", - "# \"donors.last_name = TRIM(raw_table.last_name) AND \"\n", - "# \"donors.address_1 = TRIM(raw_table.address_1) AND \"\n", - "# \"donors.address_2 = TRIM(raw_table.address_2) AND \"\n", - "# \"donors.city = TRIM(raw_table.city) AND \"\n", - "# \"donors.state = TRIM(raw_table.state) AND \"\n", - "# \"donors.employer = TRIM(raw_table.employer) AND \"\n", - "# \"donors.occupation = TRIM(raw_table.occupation) AND \"\n", - "# \"donors.zip = TRIM(raw_table.zip)\")\n", - "# conn.commit()\n", + "c.execute(q)\n", "\n", + "print('creating contributions table')\n", "q='''\n", "CREATE TABLE ria_data_science_s3.contributions as\n", " SELECT reciept_id, donors.donor_id, committee_id, \n", @@ -342,55 +204,6 @@ " donors.zip = TRIM(raw_table.zip)'''\n", "c.execute(q)\n", "\n", - "\n", - "# print('creating indexes on contributions')\n", - "# c.execute(\"ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)\")\n", - "# c.execute(\"CREATE INDEX donor_idx ON contributions (donor_id)\")\n", - "# c.execute(\"CREATE INDEX recipient_idx ON contributions (recipient_id)\")\n", - "\n", - "\n", - "# conn.commit()\n", - "\n", - "# print('nullifying empty strings in donors')\n", - "# c.execute(\"UPDATE donors \"\n", - "# \"SET \"\n", - "# \"first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, \"\n", - "# \"last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, \"\n", - "# \"address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, \"\n", - "# \"address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, \"\n", - "# \"city = CASE city WHEN '' THEN NULL ELSE city END, \"\n", - "# \"state = CASE state WHEN '' THEN NULL ELSE state END, \"\n", - "# \"employer = CASE employer WHEN '' THEN NULL 
ELSE employer END, \" \n", - "# \"occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, \" \n", - "# \"zip = CASE zip WHEN '' THEN NULL ELSE zip END\")\n", - "\n", - "\n", - "# conn.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# c.execute(\"CREATE TABLE processed_donors AS \" \n", - "# \"(SELECT donor_id, \" \n", - "# \" LOWER(city) AS city, \" \n", - "# \" CASE WHEN (first_name IS NULL AND last_name IS NULL) \"\n", - "# \" THEN NULL \"\n", - "# \" ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) \"\n", - "# \" END AS name, \" \n", - "# \" LOWER(zip) AS zip, \" \n", - "# \" LOWER(state) AS state, \" \n", - "# \" CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \"\n", - "# \" THEN NULL \"\n", - "# \" ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) \"\n", - "# \" END AS address, \" \n", - "# \" LOWER(occupation) AS occupation, \"\n", - "# \" LOWER(employer) AS employer, \"\n", - "# \" ISNULL(first_name) AS person \"\n", - "# \" FROM donors)\")\n", "q = '''\n", "CREATE TABLE ria_data_science_s3.processed_donors AS \n", " SELECT donor_id, \n", @@ -412,19 +225,34 @@ "c.execute(q)\n", "\n", "\n", - "# c.execute(\"CREATE INDEX donor_idx ON processed_donors (donor_id)\")\n", "\n", - "# c.close()\n", - "# conn.close()" + "\n", + "print('done')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "importing raw data from csv...\n", + "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 
32\\n'\n", + "creating donors table...\n", + "creating contributions table\n", + "done\n" + ] + } + ], "source": [ - "print('done')" + "!python ../athena_example/athena_example.py" ] }, { @@ -432,9 +260,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "df.shape" - ] + "source": [] } ], "metadata": { From 19ff16999bdf5e7455fd358168c9d4c9ea94d5b5 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 16 Jul 2020 15:57:25 +0000 Subject: [PATCH 06/19] new datasets --- notebooks/athena_example.ipynb | 374 ++++++++++++++++++++------------- notebooks/athena_init_db.ipynb | 76 ++++--- 2 files changed, 275 insertions(+), 175 deletions(-) diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb index 34eb1d68..01c42392 100644 --- a/notebooks/athena_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -2,37 +2,134 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting dedupe\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5e/09/179feb316147279c76ea7e6dc5a5f9e00a6feadaeda131d535247e580619/dedupe-2.0.3-cp36-cp36m-manylinux1_x86_64.whl (89kB)\n", + "\u001b[K 100% |████████████████████████████████| 92kB 239kB/s ta 0:00:011\n", + "\u001b[?25hCollecting pyathena\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/40/85/f37c049922f5d47e9126d7817ef7b8fb7abb2e6a9ea0dd06adcbffc0e8bc/PyAthena-1.10.8-py2.py3-none-any.whl (53kB)\n", + "\u001b[K 100% |████████████████████████████████| 61kB 1.9MB/s ta 0:00:011\n", + "\u001b[?25hCollecting haversine>=0.4.1 (from dedupe)\n", + " Downloading https://files.pythonhosted.org/packages/72/8e/6df8b563dd6b2961a36cd740b34c00b89142f1b97d92092c133379b2973f/haversine-2.2.0-py2.py3-none-any.whl\n", + "Collecting simplecosine>=1.2 (from dedupe)\n", + " Downloading 
https://files.pythonhosted.org/packages/2d/22/6ea3a5ab8aea06d6563eb927e706f7342a00d1849c9be6143a2a7d84ddbd/simplecosine-1.2-py2.py3-none-any.whl\n", + "Collecting rlr>=2.4.3 (from dedupe)\n", + " Downloading https://files.pythonhosted.org/packages/fa/02/3b1a9727a622ff4320919645ce35ceb887d90784d0bab41484756c33b7ea/rlr-2.4.5-py2.py3-none-any.whl\n", + "Collecting categorical-distance>=1.9 (from dedupe)\n", + " Downloading https://files.pythonhosted.org/packages/1d/b7/4f97771f52c63916f4e4d349a644c2387961592e76070e7310463b2d70a5/categorical_distance-1.9-py3-none-any.whl\n", + "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.14.3)\n", + "Collecting fastcluster (from dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1e/9d/3d7525a4722ee4a11ad969762d1de53b6dac326b5ac1366221e06958e1d7/fastcluster-1.1.26-cp36-cp36m-manylinux1_x86_64.whl (154kB)\n", + "\u001b[K 100% |████████████████████████████████| 163kB 707kB/s ta 0:00:01\n", + "\u001b[?25hCollecting highered>=0.2.0 (from dedupe)\n", + " Downloading https://files.pythonhosted.org/packages/81/00/cbd902cfd14ad1992fcdaa11a615d47b36b6136dc690e19b0afa58c7365d/highered-0.2.1-py2.py3-none-any.whl\n", + "Collecting dedupe-hcluster (from dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b2/1f/c6f6075c2e988b3a1759fabaf91d2f8f2de59c6e607a3fd9a2e06112a0de/dedupe_hcluster-0.3.8-cp36-cp36m-manylinux1_x86_64.whl (531kB)\n", + "\u001b[K 100% |████████████████████████████████| 532kB 5.2MB/s ta 0:00:01\n", + "\u001b[?25hCollecting BTrees>=4.1.4 (from dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/48/b3/9ce3b32817db98e8bf20d6873e18ee3ee7feded135434d800b72bf8dfb9f/BTrees-4.7.2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n", + "\u001b[K 100% |████████████████████████████████| 3.0MB 8.2MB/s eta 0:00:01\n", + "\u001b[?25hCollecting Levenshtein-search (from dedupe)\n", + 
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/93/89/dc320196d10447540c95f58eab5dd316a2166310356c1d88b84724f4e793/Levenshtein_search-1.4.5-cp36-cp36m-manylinux1_x86_64.whl (59kB)\n", + "\u001b[K 100% |████████████████████████████████| 61kB 21.2MB/s ta 0:00:01\n", + "\u001b[?25hCollecting zope.index (from dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ab/0f/f93bddfac1189bb6b973142da3ef2caa6817a59b07ca448095a30b644737/zope.index-5.0.0-cp36-cp36m-manylinux1_x86_64.whl (101kB)\n", + "\u001b[K 100% |████████████████████████████████| 102kB 17.6MB/s a 0:00:01\n", + "\u001b[?25hCollecting typing-extensions (from dedupe)\n", + " Downloading https://files.pythonhosted.org/packages/0c/0e/3f026d0645d699e7320b59952146d56ad7c374e9cd72cd16e7c74e657a0f/typing_extensions-3.7.4.2-py3-none-any.whl\n", + "Collecting affinegap>=1.3 (from dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b2/6a/91f5defe8178104449bc897208c9780b159575d16a959a5074f0bf39a6f0/affinegap-1.11-cp36-cp36m-manylinux1_x86_64.whl (45kB)\n", + "\u001b[K 100% |████████████████████████████████| 51kB 12.0MB/s ta 0:00:01\n", + "\u001b[?25hCollecting doublemetaphone (from dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c0/27/8df369334aac64755ca899b9a7cc4d2d60e800cca148322ef19309cdae0f/DoubleMetaphone-0.1-cp36-cp36m-manylinux1_x86_64.whl (78kB)\n", + "\u001b[K 100% |████████████████████████████████| 81kB 3.4MB/s eta 0:00:01\n", + "\u001b[?25hCollecting dedupe-variable-datetime (from dedupe)\n", + " Downloading https://files.pythonhosted.org/packages/65/8f/d21f6acadcdfd681ee038153883b5673b8b76f790e465d791780e6b7bf60/dedupe_variable_datetime-0.1.5-py3-none-any.whl\n", + "Collecting tenacity>=4.1.0 (from pyathena)\n", + " Downloading https://files.pythonhosted.org/packages/b5/05/ff089032442058bd3386f9cd991cd88ccac81dca1494d78751621ee35e62/tenacity-6.2.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: 
botocore>=1.5.52 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.15.39)\n", + "Collecting future (from pyathena)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)\n", + "\u001b[K 100% |████████████████████████████████| 829kB 14.2MB/s ta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: boto3>=1.4.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.12.39)\n", + "Collecting pylbfgs (from rlr>=2.4.3->dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b8/5b/b8e1ef62e5e5b034ce5ae919b64158ec8da4f64c995444aec7fd96e8ec42/PyLBFGS-0.2.0.13-cp36-cp36m-manylinux1_x86_64.whl (205kB)\n", + "\u001b[K 100% |████████████████████████████████| 215kB 16.4MB/s ta 0:00:01\n", + "\u001b[?25hCollecting pyhacrf-datamade>=0.2.0 (from highered>=0.2.0->dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/f5/971e17a8b6686d5fc3d562e29e9c902743eb5f0f4436880b86cb11c0149c/pyhacrf_datamade-0.2.5-cp36-cp36m-manylinux1_x86_64.whl (788kB)\n", + "\u001b[K 100% |████████████████████████████████| 798kB 14.5MB/s ta 0:00:01\n", + "\u001b[?25hCollecting zope.interface (from BTrees>=4.1.4->dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/fc/7e/8e1efcfa22b722a0d6e992172ab15a871988c290cb722fe8da6d11f1aeb2/zope.interface-5.1.0-cp36-cp36m-manylinux1_x86_64.whl (234kB)\n", + "\u001b[K 100% |████████████████████████████████| 235kB 16.6MB/s ta 0:00:01\n", + "\u001b[?25hCollecting persistent>=4.1.0 (from BTrees>=4.1.4->dedupe)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2e/4e/9bde9a2f63273f2e63a94a8198781aac559cc6efd2f560d69afcb0d9d8b5/persistent-4.6.4-cp36-cp36m-manylinux1_x86_64.whl (246kB)\n", + "\u001b[K 100% |████████████████████████████████| 256kB 17.5MB/s ta 0:00:01\n", + "\u001b[?25hRequirement 
already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.11.0)\n", + "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (39.1.0)\n", + "Collecting datetime-distance (from dedupe-variable-datetime->dedupe)\n", + " Downloading https://files.pythonhosted.org/packages/6b/98/a5eff9256ff27e3bb8030466dabd772002e5014b9237cbeb18c542050ff5/datetime_distance-0.1.3-py3-none-any.whl\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (2.7.3)\n", + "Requirement already satisfied: docutils<0.16,>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.14)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.9.4)\n", + "Requirement already satisfied: urllib3<1.26,>=1.20; python_version != \"3.4\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (1.23)\n", + "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from boto3>=1.4.4->pyathena) (0.3.3)\n", + "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.11.5)\n", + "Requirement already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == \"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.18)\n", + "Building wheels for collected packages: future\n", + " Running setup.py bdist_wheel for future ... 
\u001b[?25ldone\n", + "\u001b[?25h Stored in directory: /home/ec2-user/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully built future\n", + "Installing collected packages: haversine, simplecosine, future, pylbfgs, rlr, categorical-distance, fastcluster, pyhacrf-datamade, highered, dedupe-hcluster, zope.interface, persistent, BTrees, Levenshtein-search, zope.index, typing-extensions, affinegap, doublemetaphone, datetime-distance, dedupe-variable-datetime, dedupe, tenacity, pyathena\n", + "Successfully installed BTrees-4.7.2 Levenshtein-search-1.4.5 affinegap-1.11 categorical-distance-1.9 datetime-distance-0.1.3 dedupe-2.0.3 dedupe-hcluster-0.3.8 dedupe-variable-datetime-0.1.5 doublemetaphone-0.1 fastcluster-1.1.26 future-0.18.2 haversine-2.2.0 highered-0.2.1 persistent-4.6.4 pyathena-1.10.8 pyhacrf-datamade-0.2.5 pylbfgs-0.2.0.13 rlr-2.4.5 simplecosine-1.2 tenacity-6.2.0 typing-extensions-3.7.4.2 zope.index-5.0.0 zope.interface-5.1.0\n", + "\u001b[33mYou are using pip version 10.0.1, however version 20.2b1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], "source": [ "!pip install dedupe pyathena" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n", - "bucket='com.ria.scratch'\n", - "region='eu-west-1'\n", - "workgroup = 'RIA'\n", - "root_key='as-dedupe/'\n", - "schema_name='ria_data_science_s3'\n", "import sys\n", - "sys.path.insert(0, '../../dedupe/')\n", - "import dedupe\n", - "from io import StringIO\n", - "import csv" + "sys.path.insert(0, '../athena_example/')\n", + "import config\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AttributeError", + 
"evalue": "module 'logging' has no attribute 'logging'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;31m## Armin\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 87\u001b[0;31m \u001b[0mlog_level\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEBUG\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 88\u001b[0m \u001b[0;31m#######\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: module 'logging' has no attribute 'logging'" + ] + } + ], "source": [ "# %load ../mysql_example/mysql_example.py\n", "#!/usr/bin/python\n", @@ -61,16 +158,18 @@ "import optparse\n", "import locale\n", "import json\n", + "from io import StringIO\n", + "import csv\n", "import pandas as pd\n", "\n", "# import MySQLdb\n", "# import MySQLdb.cursors\n", "\n", - "import dedupe\n", - "import dedupe.backport\n", "import boto3\n", "from pyathena import connect\n", "from pyathena.pandas_cursor import PandasCursor\n", + "import dedupe\n", + "import dedupe.backport\n", "\n", "def dict_cursor_execute(cur, query):\n", " df = cur.execute(query).as_pandas()\n", @@ -118,7 +217,7 @@ "# log_level = logging.DEBUG\n", "\n", "## Armin\n", - " log_level = logging.WARNING\n", + " log_level = logging.DEBUG\n", "#######\n", "\n", " logging.getLogger().setLevel(log_level)\n", @@ -150,14 +249,17 @@ "# read_default_file=MYSQL_CNF)\n", "\n", " s3 = boto3.client('s3') \n", - " conn = connect(s3_staging_dir=athena_garbage,\n", - " region_name=region, 
work_group=workgroup)\n", - " cur = conn.cursor(PandasCursor, schema_name=schema_name)" + " conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n", + " aws_secret_access_key=config.SECRET_ACCESS_KEY,\n", + " s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n", + " region_name=config.REGION, \n", + " work_group=config.WORKGROUP) \n", + " cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -167,17 +269,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reading from mysql_example_settings\n" - ] - } - ], + "outputs": [], "source": [ " # We'll be using variations on this following select statement to pull\n", " # in campaign donor info.\n", @@ -269,28 +363,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "blocking...\n", - "creating blocking_map database\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ " # ## Blocking\n", "\n", @@ -320,23 +395,15 @@ " 'classification'='csv', \n", " --'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - " '''.format(bucket, root_key+'blocking_map') \n", + " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') \n", " cur.execute(q)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "creating inverted index\n" - ] - } - ], + "outputs": [], "source": [ " # If dedupe learned a Index Predicate, we have to take a pass\n", " # through the data and create indices.\n", @@ 
-356,39 +423,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "writing blocking map\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ResponseMetadata': {'RequestId': '5F215F152B811909',\n", - " 'HostId': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n", - " 'HTTPStatusCode': 200,\n", - " 'HTTPHeaders': {'x-amz-id-2': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n", - " 'x-amz-request-id': '5F215F152B811909',\n", - " 'date': 'Tue, 30 Jun 2020 15:27:16 GMT',\n", - " 'x-amz-server-side-encryption': 'AES256',\n", - " 'etag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n", - " 'content-length': '0',\n", - " 'server': 'AmazonS3'},\n", - " 'RetryAttempts': 0},\n", - " 'ETag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n", - " 'ServerSideEncryption': 'AES256'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ " # Now we are ready to write our blocking map table by creating a\n", " # generator that yields unique `(block_key, donor_id)` tuples.\n", @@ -396,7 +433,22 @@ " \n", "\n", " read_cur_dict = dict_cursor_execute(cur, DONOR_SELECT).to_dict('records')\n", - " full_data = ((row['donor_id'], row) for row in read_cur_dict)\n", + " full_data = ((row['donor_id'], row) for row in read_cur_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ " b_data = deduper.fingerprinter(full_data)\n", " buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n", "# csv_out.writerows(b_data) \n", @@ -406,13 +458,13 @@ "\n", "# write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n", "# b_data)\n", - " 
s3.put_object(Bucket=bucket, Key=root_key+'blocking_map/blocking.csv', Body=buffer) \n", + " s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer) \n", "# write_con.commit()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -447,51 +499,9 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "ename": "StopIteration", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread_cur_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mStopIteration\u001b[0m: " - ] - } - ], - "source": [ - "next(read_cur_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "clustering...\n" - ] - }, - { - "ename": "BlockingError", - "evalue": "No records have been blocked together. 
Is the data you are trying to match like the data you trained on?", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mBlockingError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'clustering...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n\u001b[0m\u001b[1;32m 5\u001b[0m threshold=0.5)\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mscore\u001b[0;34m(self, pairs)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_model\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m self.num_cores)\n\u001b[0m\u001b[1;32m 107\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m raise RuntimeError('''\n", - "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/core.py\u001b[0m in \u001b[0;36mscoreDuplicates\u001b[0;34m(record_pairs, data_model, classifier, num_cores)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mfirst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecord_pairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpeek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord_pairs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfirst\u001b[0m \u001b[0;32mis\u001b[0m 
\u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m raise BlockingError(\"No records have been blocked together. \"\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\"Is the data you are trying to match like \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \"the data you trained on?\")\n", - "\u001b[0;31mBlockingError\u001b[0m: No records have been blocked together. Is the data you are trying to match like the data you trained on?" - ] - } - ], + "outputs": [], "source": [ " # ## Clustering\n", "\n", @@ -611,6 +621,82 @@ "\n", " print('ran in', time.time() - start_time, 'seconds')" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# I'm here\n", + "Found a way to map block_key to block_numbers\n", + "** CREATE TABLE, according to some thing online, has more timeout!\n", + "** Looks like i should be using (bucketing)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Problem:\n", + "The athena mapping doesn't have many distinct values, a huge number for example have 6061:None:2, while there is only one like this in sql!?\n", + "The problem, probably was probably address, the concat was buggy and there were too many nulls.\n", + "Still while raw table matches, donors don't! 
The athena is too much bigger\n", + "Start from here: Run this query on both, the results are different" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create table as_blocking_map_number\n", + "with (bucketed_by = block_number)\n", + "as( \n", + " SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n", + " from blocking_map)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import sys\n", + "sys.path.insert(0, '../athena_example/')\n", + "from pyathena import connect\n", + "from pyathena.pandas_cursor import PandasCursor\n", + "\n", + "import config\n", + "\n", + "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n", + " aws_secret_access_key=config.SECRET_ACCESS_KEY,\n", + " s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n", + " region_name=config.REGION, \n", + " work_group=config.WORKGROUP) \n", + "cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)\n", + "q='''\n", + "with blocking_map_number as( \n", + " SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n", + " from blocking_map)\n", + "create table donor_id_pairs as (\n", + " SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", + " from blocking_map_number as l\n", + " INNER JOIN blocking_map_number as r\n", + " using (block_number)\n", + " where l.donor_id < r.donor_id)\n", + "'''\n", + "cur.execute(q)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -629,7 +715,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.10" } }, "nbformat": 4, diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb index b86d1b9f..5e8a5a32 100644 --- a/notebooks/athena_init_db.ipynb +++ b/notebooks/athena_init_db.ipynb @@ -21,6 +21,7 @@ 
"ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n", "WORKGROUP = 'RIA'\n", "REGION = 'eu-west-1'\n", + "SCHEMA_NAME = 'ria_data_science_s3'\n", "\n", "# Database Parameters\n", "DATABASE_BUCKET = 'com.ria.scratch'\n", @@ -29,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -67,6 +68,7 @@ "import boto3\n", "from pyathena import connect\n", "import config\n", + "import csv\n", "\n", "\n", "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", @@ -89,26 +91,27 @@ " zip_file.close()\n", "\n", "\n", - "s3 = boto3.client('s3') \n", "\n", + "def as_pandas(query, **kwrgs):\n", + " return utils.athena_to_panda(query, escapechar='\\\\', dtype='object', keep_default_na=False, na_values=[''], **kwrgs)\n", "\n", "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n", " aws_secret_access_key=config.SECRET_ACCESS_KEY,\n", " s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n", " region_name=config.REGION, \n", " work_group=config.WORKGROUP)\n", - "c = conn.cursor()\n", + "c = conn.cursor(schema_name=config.SCHEMA_NAME)\n", "\n", "print('importing raw data from csv...')\n", - "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n", - "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.donors\")\n", - "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.recipients\")\n", - "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.contributions\")\n", - "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS raw_table\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS donors\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS recipients\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS contributions\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS processed_donors\")\n", "\n", "\n", "q=r'''\n", - "CREATE EXTERNAL TABLE ria_data_science_s3.raw_table \n", + "CREATE EXTERNAL 
TABLE raw_table \n", " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n", " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n", @@ -125,6 +128,7 @@ "ROW FORMAT DELIMITED\n", " FIELDS TERMINATED BY '\\t'\n", " LINES TERMINATED BY '\\n' \n", + " ESCAPED BY '\\\\'\n", "LOCATION\n", " 's3://{}/{}' \n", "TBLPROPERTIES (\n", @@ -132,11 +136,12 @@ " 'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') \n", - "c.execute(q)\n", + "utils.athena_start_query(q)\n", "\n", "\n", + "df = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE, \n", + " error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#,\n", "\n", - "df = pd.read_csv(contributions_txt_file, sep='\\t', error_bad_lines=False, dtype=str, index_col=0)\n", "# Remove the very few records that mess up the demo \n", "# (demo purposes only! Don't do something like this in production)\n", "df = df[df['RcvDate'].str.len()>=10]\n", @@ -147,21 +152,25 @@ "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", "\n", "#committee ID is requred. Remove the 2 rows that don't have it.\n", - "df = df[df['ID'] != '']\n", + "df = df[df['ID']!='']\n", "\n", "# There's a record with a date stuck in the committee_id column, which causes\n", "# problems when inserting into the contributions table below. 
Get rid of it this \n", "# way.\n", "df = df[df['ID'].str.len() <=9]\n", "\n", - "# Nullifying empty strings\n", - "df = df.replace(r'^\\s*$', np.nan, regex=True)\n", + "# dropping the last columns\n", + "df = df.drop(columns='Unnamed: 29')\n", "\n", - "s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))\n", + "# Nullifying empty strings\n", + "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n", + "df_lower=df.apply(lambda x: x.str.lower() if x.dtype=='object' else x, result_type='expand')\n", + "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n", + " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))\n", "\n", "print('creating donors table...')\n", "q='''\n", - "CREATE TABLE ria_data_science_s3.donors as\n", + "CREATE TABLE donors as\n", " with tmp as\n", " (SELECT DISTINCT \n", " TRIM(last_name) as last_name, TRIM(first_name) as first_name, \n", @@ -169,20 +178,20 @@ " TRIM(city) city, TRIM(state) as state, \n", " TRIM(zip) as zip, TRIM(employer) as employer, \n", " TRIM(occupation) as occupation\n", - " FROM ria_data_science_s3.raw_table)\n", + " FROM raw_table)\n", " SELECT row_number() over () as donor_id, * from tmp'''\n", - "c.execute(q)\n", + "utils.athena_start_query(q)\n", "\n", "\n", "q='''\n", - "CREATE TABLE ria_data_science_s3.recipients as\n", - " SELECT DISTINCT committee_id, committee_name FROM ria_data_science_s3.raw_table\n", + "CREATE TABLE recipients as\n", + " SELECT DISTINCT committee_id, committee_name FROM raw_table\n", "'''\n", - "c.execute(q)\n", + "utils.athena_start_query(q)\n", "\n", "print('creating contributions table')\n", "q='''\n", - "CREATE TABLE ria_data_science_s3.contributions as\n", + "CREATE TABLE contributions as\n", " SELECT reciept_id, donors.donor_id, committee_id, \n", " report_type, date_parse(date_recieved, 
'%m/%d/%Y') as date_recieved, \n", " loan_amount, amount, \n", @@ -192,7 +201,7 @@ " election_type, election_year, \n", " date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n", " date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n", - " FROM ria_data_science_s3.raw_table JOIN ria_data_science_s3.donors ON \n", + " FROM raw_table JOIN donors ON \n", " donors.first_name = TRIM(raw_table.first_name) AND \n", " donors.last_name = TRIM(raw_table.last_name) AND \n", " donors.address_1 = TRIM(raw_table.address_1) AND \n", @@ -202,27 +211,27 @@ " donors.employer = TRIM(raw_table.employer) AND \n", " donors.occupation = TRIM(raw_table.occupation) AND \n", " donors.zip = TRIM(raw_table.zip)'''\n", - "c.execute(q)\n", + "utils.athena_start_query(q)\n", "\n", "q = '''\n", - "CREATE TABLE ria_data_science_s3.processed_donors AS \n", + "CREATE TABLE processed_donors AS \n", " SELECT donor_id, \n", " LOWER(city) AS city, \n", " CASE WHEN (first_name IS NULL AND last_name IS NULL) \n", " THEN NULL \n", - " ELSE LOWER(CONCAT(first_name, ' ', last_name)) \n", + " ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n", " END AS name, \n", " LOWER(zip) AS zip, \n", " LOWER(state) AS state, \n", " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n", " THEN NULL \n", - " ELSE LOWER(CONCAT(address_1, ' ', address_2)) \n", + " ELSE LOWER(array_join(filter(array[address_1, address_1], x-> x IS NOT NULL), ' '))\n", " END AS address, \n", " LOWER(occupation) AS occupation, \n", " LOWER(employer) AS employer, \n", " first_name is null AS person \n", - " FROM ria_data_science_s3.donors'''\n", - "c.execute(q)\n", + " FROM donors'''\n", + "utils.athena_start_query(q)\n", "\n", "\n", "\n", @@ -232,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -245,6 +254,11 @@ "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", "b'Skipping line 1631504: expected 
30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n", + "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n", "creating donors table...\n", "creating contributions table\n", "done\n" @@ -279,7 +293,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.10" } }, "nbformat": 4, From 409527de23e14336245cc9683ef3d55e3e9d852a Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 31 Jul 2020 06:25:17 +0000 Subject: [PATCH 07/19] athena-example, first version --- .gitignore | 1 + README.md | 62 ++--- athena_example/README.md | 15 +- athena_example/athena_example.py | 411 +++++++++++++++++++++++++++++++ athena_example/athena_init.py | 215 ++++++++++++++++ athena_example/config.py | 12 + athena_example/mysql.cnf_LOCAL | 4 - athena_example/mysql_example.py | 344 -------------------------- athena_example/mysql_init_db.py | 234 ------------------ athena_example/utils.py | 138 +++++++++++ notebooks/athena_example.ipynb | 408 +++++++----------------------- notebooks/athena_init_db.ipynb | 99 +++++--- 12 files changed, 941 insertions(+), 1002 deletions(-) create mode 100644 athena_example/athena_example.py create mode 100644 athena_example/athena_init.py create mode 100644 athena_example/config.py delete mode 100644 athena_example/mysql.cnf_LOCAL delete mode 100644 athena_example/mysql_example.py delete mode 100644 athena_example/mysql_init_db.py create mode 100644 athena_example/utils.py diff --git a/.gitignore 
b/.gitignore index 3fb24683..a29de92b 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ ENV distpgsql_init_db.py pgsql_example/pgsql_init_db.py .idea +.ipynb_checkpoints* diff --git a/README.md b/README.md index bb1fe2c8..82abad3d 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # Dedupe Examples -Example scripts for the [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data. +Adding Athena Example scripts for the [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data. Part of the [Dedupe.io](https://dedupe.io/) cloud service and open source toolset for de-duplicating and finding fuzzy matches in your data. For more details, see the [differences between Dedupe.io and the dedupe library](https://dedupe.io/documentation/should-i-use-dedupeio-or-the-dedupe-python-library.html). -To get these examples: +To get the athena examples: ```bash -git clone https://github.com/dedupeio/dedupe-examples.git +git clone https://github.com/asajadi/dedupe-examples.git cd dedupe-examples ``` @@ -34,61 +34,29 @@ Afterwards, whenever you want to work on dedupe-examples, workon dedupe-examples ``` -### [CSV example](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html) - early childhood locations -This example works with a list of early childhood education sites in Chicago from 10 different sources. 
+### [athena example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions -```bash -cd csv_example -pip install unidecode -python csv_example.py -``` - (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) - -**To see how you might use dedupe with smallish data, see the [annotated source code for csv_example.py](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html).** - -### [Patent example](https://dedupeio.github.io/dedupe-examples/docs/patent_example.html) - patent holders - -This example works with Dutch inventors from the PATSTAT international patent data file - -```bash -cd patent_example -pip install unidecode -python patent_example.py -``` - (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) +Takes a database of IL campaign contribution data, loads it in to a +Athena database, and identifies the unique donors. -### [Record Linkage example](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html) - electronics products -This example links two spreadsheets of electronics products and links up the matching entries. Each dataset individually has no duplicates. 
+To follow this example you need to -```bash -cd record_linkage_example -python record_linkage_example.py -``` +* Create a Athena database called 'contributions' +* Update `athena_example/config.py` with your Athena credentials +* Install dependencies, `pip install -r requirements.txt` -**To see how you might use dedupe for linking datasets, see the [annotated source code for record_linkage_example.py](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html).** - -### [Gazetteer example](https://dedupeio.github.io/dedupe-examples/docs/gazetteer_example.html) - electronics products -This example links two spreadsheets of electronics products and links up the matching entries using the Gazetteer class +Once that's all done you can run the example: ```bash -cd gazetteer_example.py -python gazetteer_example.py +cd mysql_example +python athena_init_db.py +python athena_example.py ``` + (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) -### [MySQL example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions - -See `mysql_example/README.md` for details - -**To see how you might use dedupe with bigish data, see the [annotated source code for mysql_example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html).** - - -### [PostgreSQL big dedupe example](https://dedupeio.github.io/dedupe-examples/docs/pgsql_big_dedupe_example.html) - PostgreSQL example on large dataset - -See `pgsql_big_dedupe_example/README.md` for details -This is the same example as the MySQL IL campaign contributions dataset above, but ported to run on PostgreSQL. 
## Training diff --git a/athena_example/README.md b/athena_example/README.md index a027b3b0..3530935d 100644 --- a/athena_example/README.md +++ b/athena_example/README.md @@ -1,23 +1,20 @@ -# MySQL Example +# Athena Example Takes a database of IL campaign contribution data, loads it in to a -MySQL database, and identifies the unique donors. This can take a few -hours and will noticeably tax your laptop. You might want to run it -overnight. +Athena database, and identifies the unique donors. To follow this example you need to -* Create a MySQL database called 'contributions' -* Copy `mysql_example/mysql.cnf_LOCAL` to `mysql_example/mysql.cnf` -* Update `mysql_example/mysql.cnf` with your MySQL username and password +* Create a Athena database called 'contributions' +* Update `athena_example/config.py` with your Athena credentials * Install dependencies, `pip install -r requirements.txt` Once that's all done you can run the example: ```bash cd mysql_example -python mysql_init_db.py -python mysql_example.py +python athena_init_db.py +python athena_example.py ``` (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py new file mode 100644 index 00000000..a738c56c --- /dev/null +++ b/athena_example/athena_example.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +# %load ../mysql_example/mysql_example.py +#!/usr/bin/python + +""" +This is an example of working with very large data. There are about +700,000 unduplicated donors in this database of Illinois political +campaign contributions. + +With such a large set of input data, we cannot store all the comparisons +we need to make in memory. Instead, we will read the pairs on demand +from the MySQL database. + +__Note:__ You will need to run `python mysql_init_db.py` +before running this script. 
See the annotates source for +[mysql_init_db.py](mysql_init_db.html) + +For smaller datasets (<10,000), see our +[csv_example](csv_example.html) +""" + +# There is a little bit difference between the result +# of this module and the mysql one. The reason is due to +# Some special (and mostly erroneous) characters, such as \a .. +# Which are dealt with differently by mysql and athena/panda + +import sys +import os +import itertools +import time +import logging +import optparse +import locale +import json +from io import StringIO +import csv +import pandas as pd + +import boto3 +import dedupe +import dedupe.backport +sys.path.insert(0, '../athena_example/') +import config +sys.path.insert(0, '../athena_example/') +import utils + +def as_pandas(query, **kwrgs): + df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs) + return df.where(pd.notnull(df), None) + +def record_pairs(result_set): + for i, row in enumerate(result_set): + a_record_id, a_record, b_record_id, b_record = row + record_a = (a_record_id, json.loads(a_record)) + record_b = (b_record_id, json.loads(b_record)) + + yield record_a, record_b + + if i % 10000 == 0: + print(i) + + +def cluster_ids(clustered_dupes): + + for cluster, scores in clustered_dupes: + cluster_id = cluster[0] + for donor_id, score in zip(cluster, scores): + yield donor_id, cluster_id, score + + +if __name__ == '__main__': + + # ## Logging + + # Dedupe uses Python logging to show or suppress verbose output. Added + # for convenience. 
To enable verbose output, run `python + # examples/mysql_example/mysql_example.py -v` + + optp = optparse.OptionParser() + optp.add_option('-v', '--verbose', dest='verbose', action='count', + help='Increase verbosity (specify multiple times for more)' + ) + (opts, args) = optp.parse_args() + log_level = logging.WARNING + if opts.verbose: + if opts.verbose == 1: + log_level = logging.INFO + elif opts.verbose >= 2: + log_level = logging.DEBUG + + + logging.getLogger().setLevel(log_level) + + + + + settings_file = 'mysql_example_settings' + training_file = 'mysql_example_training.json' + + start_time = time.time() + + +# In[ ]: + + +# We'll be using variations on this following select statement to pull +# in campaign donor info. +# +# We did a fair amount of preprocessing of the fields in +# `mysql_init_db.py` +DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address " "from processed_donors" + +# ## Training + +if os.path.exists(settings_file): + print('reading from ', settings_file) + with open(settings_file, 'rb') as sf: + deduper = dedupe.StaticDedupe(sf, num_cores=4) +else: + # Define the fields dedupe will pay attention to + # + # The address, city, and zip fields are often missing, so we'll + # tell dedupe that, and we'll learn a model that take that into + # account + fields = [{'field': 'name', 'type': 'String'}, + {'field': 'address', 'type': 'String', + 'has missing': True}, + {'field': 'city', 'type': 'ShortString', 'has missing': True}, + {'field': 'state', 'type': 'ShortString', 'has missing': True}, + {'field': 'zip', 'type': 'ShortString', 'has missing': True}, + ] + + # Create a new deduper object and pass our data model to it. + deduper = dedupe.Dedupe(fields, num_cores=4) + + # We will sample pairs from the entire donor table for training +# with read_con.cursor() as cur: + + # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! 
+ # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str +# with conn.cursor(PandasCursor, schema_name=schema_name) as cursor: + temp_df = as_pandas(DONOR_SELECT) + temp_d = temp_df.to_dict('index') + + + # If we have training data saved from a previous run of dedupe, + # look for it an load it in. + # + # __Note:__ if you want to train from + # scratch, delete the training_file + if os.path.exists(training_file): + print('reading labeled examples from ', training_file) + with open(training_file) as tf: + deduper.prepare_training(temp_d, training_file=tf) + else: + deduper.prepare_training(temp_d) + + del temp_d + + # ## Active learning + + print('starting active labeling...') + # Starts the training loop. Dedupe will find the next pair of records + # it is least certain about and ask you to label them as duplicates + # or not. + + # use 'y', 'n' and 'u' keys to flag duplicates + # press 'f' when you are finished + dedupe.convenience.console_label(deduper) + # When finished, save our labeled, training pairs to disk + with open(training_file, 'w') as tf: + deduper.write_training(tf) + + # Notice our the argument here + # + # `recall` is the proportion of true dupes pairs that the learned + # rules must cover. You may want to reduce this if your are making + # too many blocks and too many comparisons. 
+ deduper.train(recall=0.90) + + with open(settings_file, 'wb') as sf: + deduper.write_settings(sf) + + # We can now remove some of the memory hobbing objects we used + # for training + deduper.cleanup_training() + + +# In[ ]: + + +# ## Blocking + +print('blocking...') + +# To run blocking on such a large set of data, we create a separate table +# that contains blocking keys and record ids +print('creating blocking_map database') +utils.athena_start_query("DROP TABLE IF EXISTS blocking_map") + +q=''' +CREATE EXTERNAL TABLE blocking_map + (block_key VARCHAR(200), donor_id INTEGER) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + LINES TERMINATED BY '\n' +LOCATION + 's3://{}/{}' +TBLPROPERTIES ( + 'classification'='csv', + --'skip.header.line.count'='1', + 'serialization.null.format'='') +'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') +utils.athena_start_query(q) + + +# In[ ]: + + +# If dedupe learned a Index Predicate, we have to take a pass +# through the data and create indices. +print('creating inverted index') + +# Armin: +# This never runs, index_fields is empty, possible bug? +for field in deduper.fingerprinter.index_fields: + q = ''' + SELECT DISTINCT {field} FROM processed_donors + WHERE {field} IS NOT NULL + '''.format(field=field) + cur_df = as_pandas(q) + # Do I need to cast it as a list? + field_data = cur_df[field] + deduper.fingerprinter.index(field_data, field) + + + +# In[ ]: + + +# Now we are ready to write our blocking map table by creating a +# generator that yields unique `(block_key, donor_id)` tuples. 
+print('writing blocking map') + + +read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records') +full_data = ((row['donor_id'], row) for row in read_cur_dict) + + +# In[ ]: + + +b_data = deduper.fingerprinter(full_data) +buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\t') utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer) + + +# In[ ]: + + + + # select unique pairs to compare + q=''' + SELECT a.donor_id, + json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], + ARRAY[ a.city, a.name, a.zip, a.state, a.address]) + AS JSON)), + b.donor_id, + json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], + ARRAY[ b.city, b.name, b.zip, b.state, b.address]) + AS JSON)) + FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west + from blocking_map as l + INNER JOIN blocking_map as r + using (block_key) + where l.donor_id < r.donor_id) ids + INNER JOIN processed_donors a on ids.east=a.donor_id + INNER JOIN processed_donors b on ids.west=b.donor_id + ''' + read_cur_dict=as_pandas(q).itertuples(index=False, name=None) + + +# In[ ]: + + +# ## Clustering + +print('clustering...') +clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)), + threshold=0.5) + + +# In[ ]: + + +utils.athena_start_query("DROP TABLE IF EXISTS entity_map") + +print('creating entity_map database') +q=''' +CREATE EXTERNAL TABLE entity_map + (donor_id INTEGER, canon_id INTEGER, + cluster_score FLOAT) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + LINES TERMINATED BY '\n' +LOCATION + 's3://{}/{}' +TBLPROPERTIES ( + 'classification'='csv', + --'skip.header.line.count'='1', + 'serialization.null.format'='') +'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') +utils.athena_start_query(q) + +buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\t') 
+utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer) + + +# In[ ]: + + +# Print out the number of duplicates found +print('# duplicate sets') + +# ## Payoff + +# With all this done, we can now begin to ask interesting questions +# of the data +# +# For example, let's see who the top 10 donors are. + +locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers + +utils.athena_start_query("DROP TABLE IF EXISTS e_map") +q = ''' +CREATE TABLE e_map as + SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id + FROM entity_map + RIGHT JOIN donors USING(donor_id) +''' + +utils.athena_start_query(q) +q =''' +SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, + donation_totals.totals AS totals +FROM donors INNER JOIN + (SELECT canon_id, SUM(cast (amount as double)) AS totals + FROM contributions INNER JOIN e_map + USING (donor_id) + GROUP BY (canon_id) + ORDER BY totals + DESC LIMIT 10) + AS donation_totals +ON donors.donor_id = donation_totals.canon_id +ORDER BY totals DESC +''' +cur_dict = as_pandas(q).to_dict('records') + +print("Top Donors (deduped)") +for row in cur_dict: + row['totals'] = locale.currency(row['totals'], grouping=True) + print('%(totals)20s: %(name)s' % row) + +# Compare this to what we would have gotten if we hadn't done any +# deduplication + +q = ''' +with donorscontributions as( + + SELECT donors.donor_id, + array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, + cast(contributions.amount as double) as amount + FROM donors INNER JOIN contributions + USING (donor_id) +) +SELECT name, sum(amount) AS totals +FROM donorscontributions +GROUP BY donor_id, name +ORDER BY totals DESC +LIMIT 10 +''' + +cur_dict = as_pandas(q).to_dict('records') + +print("Top Donors (raw)") +for row in cur_dict: + row['totals'] = locale.currency(row['totals'], 
grouping=True) + print('%(totals)20s: %(name)s' % row) + +# Close our database connection +# read_con.close() +# write_con.close() + +print('ran in', time.time() - start_time, 'seconds') + + +# In[9]: + + +get_ipython().system('jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/') + + +# In[ ]: + + + + diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py new file mode 100644 index 00000000..9ddb14c8 --- /dev/null +++ b/athena_example/athena_init.py @@ -0,0 +1,215 @@ +#!/usr/bin/python +""" +This is a setup script for athena_example. It downloads a zip file of +Illinois campaign contributions and loads them into a Athena database +named 'contributions'. + +__Note:__ You will need to run this script first before execuing +[athena_example.py](athena_example.py). + +Tables created: +* raw_table - raw import of entire CSV file +* donors - all distinct donors based on name and address +* recipients - all distinct campaign contribution recipients +* contributions - contribution amounts tied to donor and recipients tables +""" + +import os +import zipfile +import warnings +import pandas as pd +import numpy as np +from urllib.request import urlopen +import boto3 +import config +import csv +import sys +sys.path.insert(0, '../athena_example/') +import utils + + +contributions_zip_file = 'Illinois-campaign-contributions.txt.zip' +contributions_txt_file = 'Illinois-campaign-contributions.txt' + +if not os.path.exists(contributions_zip_file) : + print('downloading', contributions_zip_file, '(~60mb) ...') + u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip') + localFile = open(contributions_zip_file, 'wb') + localFile.write(u.read()) + localFile.close() + +if not os.path.exists(contributions_txt_file) : + zip_file = zipfile.ZipFile(contributions_zip_file, 'r') + print('extracting %s' % contributions_zip_file) + zip_file_contents = zip_file.namelist() + for f in zip_file_contents: + if 
('.txt' in f): + zip_file.extract(f) + zip_file.close() + + + + +print('importing raw data from csv...') +utils.athena_start_query("DROP TABLE IF EXISTS raw_table") +utils.athena_start_query("DROP TABLE IF EXISTS donors") +utils.athena_start_query("DROP TABLE IF EXISTS recipients") +utils.athena_start_query("DROP TABLE IF EXISTS contributions") +utils.athena_start_query("DROP TABLE IF EXISTS processed_donors") + + +q=r''' +CREATE EXTERNAL TABLE raw_table + (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), + address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), + state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), + date_recieved VARCHAR(10), loan_amount VARCHAR(12), + amount VARCHAR(23), receipt_type VARCHAR(23), + employer VARCHAR(70), occupation VARCHAR(40), + vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), + vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), + vendor_city VARCHAR(20), vendor_state VARCHAR(10), + vendor_zip VARCHAR(10), description VARCHAR(90), + election_type VARCHAR(10), election_year VARCHAR(10), + report_period_begin VARCHAR(10), report_period_end VARCHAR(33), + committee_name VARCHAR(70), committee_id VARCHAR(37)) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + ESCAPED BY '\\' + LINES TERMINATED BY '\n' +LOCATION + 's3://{}/{}' +TBLPROPERTIES ( + 'classification'='csv', + 'skip.header.line.count'='1', + 'serialization.null.format'='') +'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') +utils.athena_start_query(q) + + +df = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE, + error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#, + +# Remove the very few records that mess up the demo +# (demo purposes only! 
Don't do something like this in production) +df = df[df['RcvDate'].str.len()>=10] + +# set empty, non-zero, strings in date columns to null +df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan + +df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan + +#committee ID is requred. Remove the 2 rows that don't have it. +df = df[df['ID']!=''] + +# There's a record with a date stuck in the committee_id column, which causes +# problems when inserting into the contributions table below. Get rid of it this +# way. +df = df[df['ID'].str.len() <=9] + +# dropping the last columns +df = df.drop(columns='Unnamed: 29') + +# Nullifying empty strings +# df = df.replace(r'^\s*$', np.nan, regex=True) +df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand') + +utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None), + filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,)) + +# Athena is doesn't equate empty string and null, eventhough in the table spec we said so +# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it +# So we need to manually take care of that +print('creating donors table...') +q=''' +CREATE TABLE donors as + with tmp as + (SELECT DISTINCT + NULLIF(TRIM(last_name), '') as last_name, + NULLIF(TRIM(first_name), '') as first_name, + NULLIF(TRIM(address_1), '') as address_1, + NULLIF(TRIM(address_2), '') as address_2, + NULLIF(TRIM(city), '') city, + NULLIF(TRIM(state), '') as state, + NULLIF(TRIM(zip), '') as zip, + NULLIF(TRIM(employer), '') as employer, + NULLIF(TRIM(occupation), '') as occupation + FROM raw_table) + SELECT row_number() over () as donor_id, * from tmp''' +utils.athena_start_query(q) + + +q=''' +CREATE TABLE recipients as + SELECT DISTINCT committee_id as 
recipient_id, committee_name as name FROM raw_table +''' +utils.athena_start_query(q) + +print('creating contributions table') + +# -- +# c.execute("CREATE TABLE contributions " +# "(contribution_id INT, donor_id INT, recipient_id INT, " +# " report_type VARCHAR(24), date_recieved DATE, " +# " loan_amount VARCHAR(12), amount VARCHAR(23), " +# " receipt_type VARCHAR(23), " +# " vendor_last_name VARCHAR(70), " +# " vendor_first_name VARCHAR(20), " +# " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), " +# " vendor_city VARCHAR(20), vendor_state VARCHAR(10), " +# " vendor_zip VARCHAR(10), description VARCHAR(90), " +# " election_type VARCHAR(10), election_year VARCHAR(10), " +# " report_period_begin DATE, report_period_end DATE) " +# "CHARACTER SET utf8 COLLATE utf8_unicode_ci") +# -- + +q=''' +CREATE TABLE contributions as + SELECT reciept_id as contribution_id, + donors.donor_id as donor_id , + committee_id as recipient_id, + report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, + loan_amount, amount, + receipt_type, vendor_last_name , + vendor_first_name, vendor_address_1, vendor_address_2, + vendor_city, vendor_state, vendor_zip, description, + election_type, election_year, + date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, + date_parse(report_period_end, '%m/%d/%Y') as report_period_end + FROM raw_table JOIN donors ON + coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND + coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND + coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND + coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND + coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND + coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND + coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND + coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') 
AND + coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')''' + +utils.athena_start_query(q) + +q = ''' +CREATE TABLE processed_donors AS + SELECT donor_id, + LOWER(city) AS city, + CASE WHEN (first_name IS NULL AND last_name IS NULL) + THEN NULL + ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) + END AS name, + LOWER(zip) AS zip, + LOWER(state) AS state, + CASE WHEN (address_1 IS NULL AND address_2 IS NULL) + THEN NULL + ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' ')) + END AS address, + LOWER(occupation) AS occupation, + LOWER(employer) AS employer, + first_name is null AS person + FROM donors''' +utils.athena_start_query(q) + + + + +print('done') diff --git a/athena_example/config.py b/athena_example/config.py new file mode 100644 index 00000000..60964c73 --- /dev/null +++ b/athena_example/config.py @@ -0,0 +1,12 @@ +LOG_FILE = 'log.txt' +# Connection parameters +ACCESS_KEY_ID = None +SECRET_ACCESS_KEY = None +ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/' +WORKGROUP = 'RIA' +REGION = 'eu-west-1' +DATABASE = 'ria_data_science_s3' + +# Database Parameters +DATABASE_BUCKET = 'com.ria.scratch' +DATABASE_ROOT_KEY = 'as-dedupe/' diff --git a/athena_example/mysql.cnf_LOCAL b/athena_example/mysql.cnf_LOCAL deleted file mode 100644 index 17bded3f..00000000 --- a/athena_example/mysql.cnf_LOCAL +++ /dev/null @@ -1,4 +0,0 @@ -[client] -user = your_mysql_user -password = your_mysql_password -default-character-set=utf8 diff --git a/athena_example/mysql_example.py b/athena_example/mysql_example.py deleted file mode 100644 index 5e257e13..00000000 --- a/athena_example/mysql_example.py +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -This is an example of working with very large data. There are about -700,000 unduplicated donors in this database of Illinois political -campaign contributions. 
- -With such a large set of input data, we cannot store all the comparisons -we need to make in memory. Instead, we will read the pairs on demand -from the MySQL database. - -__Note:__ You will need to run `python mysql_init_db.py` -before running this script. See the annotates source for -[mysql_init_db.py](mysql_init_db.html) - -For smaller datasets (<10,000), see our -[csv_example](csv_example.html) -""" - -import os -import itertools -import time -import logging -import optparse -import locale -import json - -import MySQLdb -import MySQLdb.cursors - -import dedupe -import dedupe.backport - - -def record_pairs(result_set): - for i, row in enumerate(result_set): - a_record_id, a_record, b_record_id, b_record = row - record_a = (a_record_id, json.loads(a_record)) - record_b = (b_record_id, json.loads(b_record)) - - yield record_a, record_b - - if i % 10000 == 0: - print(i) - - -def cluster_ids(clustered_dupes): - - for cluster, scores in clustered_dupes: - cluster_id = cluster[0] - for donor_id, score in zip(cluster, scores): - yield donor_id, cluster_id, score - - -if __name__ == '__main__': - - # ## Logging - - # Dedupe uses Python logging to show or suppress verbose output. Added - # for convenience. 
To enable verbose output, run `python - # examples/mysql_example/mysql_example.py -v` - optp = optparse.OptionParser() - optp.add_option('-v', '--verbose', dest='verbose', action='count', - help='Increase verbosity (specify multiple times for more)' - ) - (opts, args) = optp.parse_args() - log_level = logging.WARNING - if opts.verbose: - if opts.verbose == 1: - log_level = logging.INFO - elif opts.verbose >= 2: - log_level = logging.DEBUG - logging.getLogger().setLevel(log_level) - - # ## Setup - MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf' - - settings_file = 'mysql_example_settings' - training_file = 'mysql_example_training.json' - - start_time = time.time() - - # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to - # `examples/mysql_example/mysql.cnf` and fill in your mysql database - # information in `examples/mysql_example/mysql.cnf` - - # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid - # having to have enormous result sets in - # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor). - read_con = MySQLdb.connect(db='contributions', - charset='utf8', - read_default_file=MYSQL_CNF, - cursorclass=MySQLdb.cursors.SSDictCursor) - - write_con = MySQLdb.connect(db='contributions', - charset='utf8', - read_default_file=MYSQL_CNF) - - # We'll be using variations on this following select statement to pull - # in campaign donor info. 
- # - # We did a fair amount of preprocessing of the fields in - # `mysql_init_db.py` - - DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address " \ - "from processed_donors" - - # ## Training - - if os.path.exists(settings_file): - print('reading from ', settings_file) - with open(settings_file, 'rb') as sf: - deduper = dedupe.StaticDedupe(sf, num_cores=4) - else: - # Define the fields dedupe will pay attention to - # - # The address, city, and zip fields are often missing, so we'll - # tell dedupe that, and we'll learn a model that take that into - # account - fields = [{'field': 'name', 'type': 'String'}, - {'field': 'address', 'type': 'String', - 'has missing': True}, - {'field': 'city', 'type': 'ShortString', 'has missing': True}, - {'field': 'state', 'type': 'ShortString', 'has missing': True}, - {'field': 'zip', 'type': 'ShortString', 'has missing': True}, - ] - - # Create a new deduper object and pass our data model to it. - deduper = dedupe.Dedupe(fields, num_cores=4) - - # We will sample pairs from the entire donor table for training - with read_con.cursor() as cur: - cur.execute(DONOR_SELECT) - temp_d = {i: row for i, row in enumerate(cur)} - - # If we have training data saved from a previous run of dedupe, - # look for it an load it in. - # - # __Note:__ if you want to train from - # scratch, delete the training_file - if os.path.exists(training_file): - print('reading labeled examples from ', training_file) - with open(training_file) as tf: - deduper.prepare_training(temp_d, training_file=tf) - else: - deduper.prepare_training(temp_d) - - del temp_d - - # ## Active learning - - print('starting active labeling...') - # Starts the training loop. Dedupe will find the next pair of records - # it is least certain about and ask you to label them as duplicates - # or not. 
- - # use 'y', 'n' and 'u' keys to flag duplicates - # press 'f' when you are finished - dedupe.convenience.console_label(deduper) - # When finished, save our labeled, training pairs to disk - with open(training_file, 'w') as tf: - deduper.write_training(tf) - - # Notice our the argument here - # - # `recall` is the proportion of true dupes pairs that the learned - # rules must cover. You may want to reduce this if your are making - # too many blocks and too many comparisons. - deduper.train(recall=0.90) - - with open(settings_file, 'wb') as sf: - deduper.write_settings(sf) - - # We can now remove some of the memory hobbing objects we used - # for training - deduper.cleanup_training() - - # ## Blocking - - print('blocking...') - - # To run blocking on such a large set of data, we create a separate table - # that contains blocking keys and record ids - print('creating blocking_map database') - with write_con.cursor() as cur: - cur.execute("DROP TABLE IF EXISTS blocking_map") - cur.execute("CREATE TABLE blocking_map " - "(block_key VARCHAR(200), donor_id INTEGER) " - "CHARACTER SET utf8 COLLATE utf8_unicode_ci") - - write_con.commit() - - # If dedupe learned a Index Predicate, we have to take a pass - # through the data and create indices. - print('creating inverted index') - - for field in deduper.fingerprinter.index_fields: - with read_con.cursor() as cur: - cur.execute("SELECT DISTINCT {field} FROM processed_donors " - "WHERE {field} IS NOT NULL".format(field=field)) - field_data = (row[0] for row in cur) - deduper.fingerprinter.index(field_data, field) - - # Now we are ready to write our blocking map table by creating a - # generator that yields unique `(block_key, donor_id)` tuples. 
- print('writing blocking map') - - with read_con.cursor() as read_cur: - read_cur.execute(DONOR_SELECT) - full_data = ((row['donor_id'], row) for row in read_cur) - b_data = deduper.fingerprinter(full_data) - - with write_con.cursor() as write_cur: - - write_cur.executemany("INSERT INTO blocking_map VALUES (%s, %s)", - b_data) - - write_con.commit() - - # Free up memory by removing indices we don't need anymore - deduper.fingerprinter.reset_indices() - - # indexing blocking_map - print('creating index') - with write_con.cursor() as cur: - cur.execute("CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)") - - write_con.commit() - read_con.commit() - - # select unique pairs to compare - with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur: - - read_cur.execute(""" - select a.donor_id, - json_object('city', a.city, - 'name', a.name, - 'zip', a.zip, - 'state', a.state, - 'address', a.address), - b.donor_id, - json_object('city', b.city, - 'name', b.name, - 'zip', b.zip, - 'state', b.state, - 'address', b.address) - from (select DISTINCT l.donor_id as east, r.donor_id as west - from blocking_map as l - INNER JOIN blocking_map as r - using (block_key) - where l.donor_id < r.donor_id) ids - INNER JOIN processed_donors a on ids.east=a.donor_id - INNER JOIN processed_donors b on ids.west=b.donor_id - """) - - # ## Clustering - - print('clustering...') - clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)), - threshold=0.5) - - with write_con.cursor() as write_cur: - - # ## Writing out results - - # We now have a sequence of tuples of donor ids that dedupe believes - # all refer to the same entity. 
We write this out onto an entity map - # table - write_cur.execute("DROP TABLE IF EXISTS entity_map") - - print('creating entity_map database') - write_cur.execute("CREATE TABLE entity_map " - "(donor_id INTEGER, canon_id INTEGER, " - " cluster_score FLOAT, PRIMARY KEY(donor_id))") - - write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)', - cluster_ids(clustered_dupes)) - - write_con.commit() - - with write_con.cursor() as cur: - cur.execute("CREATE INDEX head_index ON entity_map (canon_id)") - - write_con.commit() - read_con.commit() - - # Print out the number of duplicates found - print('# duplicate sets') - - # ## Payoff - - # With all this done, we can now begin to ask interesting questions - # of the data - # - # For example, let's see who the top 10 donors are. - - locale.setlocale(locale.LC_ALL, '') # for pretty printing numbers - - with read_con.cursor() as cur: - # Create a temporary table so each group and unmatched record has - # a unique id - cur.execute("CREATE TEMPORARY TABLE e_map " - "SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id " - "FROM entity_map " - "RIGHT JOIN donors USING(donor_id)") - - cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, " - "donation_totals.totals AS totals " - "FROM donors INNER JOIN " - "(SELECT canon_id, SUM(amount) AS totals " - " FROM contributions INNER JOIN e_map " - " USING (donor_id) " - " GROUP BY (canon_id) " - " ORDER BY totals " - " DESC LIMIT 10) " - "AS donation_totals " - "WHERE donors.donor_id = donation_totals.canon_id") - - print("Top Donors (deduped)") - for row in cur: - row['totals'] = locale.currency(row['totals'], grouping=True) - print('%(totals)20s: %(name)s' % row) - - # Compare this to what we would have gotten if we hadn't done any - # deduplication - cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, " - "SUM(contributions.amount) AS totals " - "FROM donors INNER JOIN contributions " - "USING (donor_id) " - 
"GROUP BY (donor_id) " - "ORDER BY totals DESC " - "LIMIT 10") - - print("Top Donors (raw)") - for row in cur: - row['totals'] = locale.currency(row['totals'], grouping=True) - print('%(totals)20s: %(name)s' % row) - - # Close our database connection - read_con.close() - write_con.close() - - print('ran in', time.time() - start_time, 'seconds') diff --git a/athena_example/mysql_init_db.py b/athena_example/mysql_init_db.py deleted file mode 100644 index fcdc1256..00000000 --- a/athena_example/mysql_init_db.py +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/python -""" -This is a setup script for mysql_example. It downloads a zip file of -Illinois campaign contributions and loads them into a MySQL database -named 'contributions'. - -__Note:__ You will need to run this script first before execuing -[mysql_example.py](mysql_example.html). - -Tables created: -* raw_table - raw import of entire CSV file -* donors - all distinct donors based on name and address -* recipients - all distinct campaign contribution recipients -* contributions - contribution amounts tied to donor and recipients tables -""" - -import os -import zipfile -import warnings - -from urllib.request import urlopen - -import MySQLdb - -warnings.filterwarnings('ignore', category=MySQLdb.Warning) - -contributions_zip_file = 'Illinois-campaign-contributions.txt.zip' -contributions_txt_file = 'Illinois-campaign-contributions.txt' - -if not os.path.exists(contributions_zip_file) : - print('downloading', contributions_zip_file, '(~60mb) ...') - u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip') - localFile = open(contributions_zip_file, 'wb') - localFile.write(u.read()) - localFile.close() - -if not os.path.exists(contributions_txt_file) : - zip_file = zipfile.ZipFile(contributions_zip_file, 'r') - print('extracting %s' % contributions_zip_file) - zip_file_contents = zip_file.namelist() - for f in zip_file_contents: - if ('.txt' in f): - zip_file.extract(f) - 
zip_file.close() - -conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf', - local_infile = 1, - sql_mode="ALLOW_INVALID_DATES", - db='contributions') -c = conn.cursor() - -print('importing raw data from csv...') -c.execute("DROP TABLE IF EXISTS raw_table") -c.execute("DROP TABLE IF EXISTS donors") -c.execute("DROP TABLE IF EXISTS recipients") -c.execute("DROP TABLE IF EXISTS contributions") -c.execute("DROP TABLE IF EXISTS processed_donors") - -c.execute("CREATE TABLE raw_table " - "(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), " - " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), " - " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), " - " date_recieved VARCHAR(10), loan_amount VARCHAR(12), " - " amount VARCHAR(23), receipt_type VARCHAR(23), " - " employer VARCHAR(70), occupation VARCHAR(40), " - " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), " - " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), " - " vendor_city VARCHAR(20), vendor_state VARCHAR(10), " - " vendor_zip VARCHAR(10), description VARCHAR(90), " - " election_type VARCHAR(10), election_year VARCHAR(10), " - " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), " - " committee_name VARCHAR(70), committee_id VARCHAR(37)) " - "CHARACTER SET utf8 COLLATE utf8_unicode_ci") - - -conn.commit() - -c.execute("LOAD DATA LOCAL INFILE %s INTO TABLE raw_table " - "FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\r\n' " - "IGNORE 1 LINES " - "(reciept_id, last_name, first_name, " - " address_1, address_2, city, state, " - " zip, report_type, date_recieved, " - " loan_amount, amount, receipt_type, " - " employer, occupation, vendor_last_name, " - " vendor_first_name, vendor_address_1, " - " vendor_address_2, vendor_city, vendor_state, " - " vendor_zip, description, election_type, " - " election_year, " - " report_period_begin, report_period_end, " - " committee_name, committee_id, @dummy)", - 
(contributions_txt_file,)) - -# Remove the very few records that mess up the demo -# (demo purposes only! Don't do something like this in production) -c.execute("DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10") - -# set empty, non-zero, strings in date columns to null -c.execute("UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10") -c.execute("UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10") - -#committee ID is requred. Remove the 2 rows that don't have it. -c.execute("DELETE FROM raw_table WHERE committee_id=''"); - -# There's a record with a date stuck in the committee_id column, which causes -# problems when inserting into the contributions table below. Get rid of it this -# way. -c.execute("DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9") -conn.commit() - - - -print('creating donors table...') -c.execute("CREATE TABLE donors " - "(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, " - " last_name VARCHAR(70), first_name VARCHAR(35), " - " address_1 VARCHAR(35), address_2 VARCHAR(36), " - " city VARCHAR(20), state VARCHAR(15), " - " zip VARCHAR(11), employer VARCHAR(70), " - " occupation VARCHAR(40)) " - "CHARACTER SET utf8 COLLATE utf8_unicode_ci") -c.execute("INSERT INTO donors " - "(first_name, last_name, address_1," - " address_2, city, state, zip, employer, occupation) " - "SELECT DISTINCT " - "TRIM(first_name), TRIM(last_name), TRIM(address_1), " - "TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), " - "TRIM(employer), TRIM(occupation) " - "FROM raw_table") -conn.commit() - - -print('creating indexes on donors table') -c.execute("CREATE INDEX donors_donor_info ON donors " - "(last_name, first_name, address_1, address_2, city, " - " state, zip)") -conn.commit() - - - -print('creating recipients table...') -c.execute("CREATE TABLE recipients " - "(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) " - "CHARACTER SET utf8 COLLATE utf8_unicode_ci") - 
-c.execute("INSERT IGNORE INTO recipients " - "SELECT DISTINCT committee_id, committee_name FROM raw_table") -conn.commit() - -print('creating contributions table') -c.execute("CREATE TABLE contributions " - "(contribution_id INT, donor_id INT, recipient_id INT, " - " report_type VARCHAR(24), date_recieved DATE, " - " loan_amount VARCHAR(12), amount VARCHAR(23), " - " receipt_type VARCHAR(23), " - " vendor_last_name VARCHAR(70), " - " vendor_first_name VARCHAR(20), " - " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), " - " vendor_city VARCHAR(20), vendor_state VARCHAR(10), " - " vendor_zip VARCHAR(10), description VARCHAR(90), " - " election_type VARCHAR(10), election_year VARCHAR(10), " - " report_period_begin DATE, report_period_end DATE) " - "CHARACTER SET utf8 COLLATE utf8_unicode_ci") - - -c.execute("INSERT INTO contributions " - "SELECT reciept_id, donors.donor_id, committee_id, " - " report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), " - " loan_amount, amount, " - " receipt_type, vendor_last_name , " - " vendor_first_name, vendor_address_1, vendor_address_2, " - " vendor_city, vendor_state, vendor_zip, description, " - " election_type, election_year, " - " STR_TO_DATE(report_period_begin, '%m/%d/%Y'), " - " STR_TO_DATE(report_period_end, '%m/%d/%Y') " - "FROM raw_table JOIN donors ON " - "donors.first_name = TRIM(raw_table.first_name) AND " - "donors.last_name = TRIM(raw_table.last_name) AND " - "donors.address_1 = TRIM(raw_table.address_1) AND " - "donors.address_2 = TRIM(raw_table.address_2) AND " - "donors.city = TRIM(raw_table.city) AND " - "donors.state = TRIM(raw_table.state) AND " - "donors.employer = TRIM(raw_table.employer) AND " - "donors.occupation = TRIM(raw_table.occupation) AND " - "donors.zip = TRIM(raw_table.zip)") -conn.commit() - -print('creating indexes on contributions') -c.execute("ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)") -c.execute("CREATE INDEX donor_idx ON contributions (donor_id)") 
-c.execute("CREATE INDEX recipient_idx ON contributions (recipient_id)") - - -conn.commit() - -print('nullifying empty strings in donors') -c.execute("UPDATE donors " - "SET " - "first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, " - "last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, " - "address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, " - "address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, " - "city = CASE city WHEN '' THEN NULL ELSE city END, " - "state = CASE state WHEN '' THEN NULL ELSE state END, " - "employer = CASE employer WHEN '' THEN NULL ELSE employer END, " - "occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, " - "zip = CASE zip WHEN '' THEN NULL ELSE zip END") - - -conn.commit() - -c.execute("CREATE TABLE processed_donors AS " - "(SELECT donor_id, " - " LOWER(city) AS city, " - " CASE WHEN (first_name IS NULL AND last_name IS NULL) " - " THEN NULL " - " ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) " - " END AS name, " - " LOWER(zip) AS zip, " - " LOWER(state) AS state, " - " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) " - " THEN NULL " - " ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) " - " END AS address, " - " LOWER(occupation) AS occupation, " - " LOWER(employer) AS employer, " - " ISNULL(first_name) AS person " - " FROM donors)") - -c.execute("CREATE INDEX donor_idx ON processed_donors (donor_id)") - -c.close() -conn.close() -print('done') diff --git a/athena_example/utils.py b/athena_example/utils.py new file mode 100644 index 00000000..77f18fda --- /dev/null +++ b/athena_example/utils.py @@ -0,0 +1,138 @@ +from __future__ import print_function +import re +import boto3 +import botocore +import sys +import datetime +import os +import time +import pandas as pd +from six import string_types +import sys +pyver = sys.version_info[0] + +if pyver<3: + from StringIO import StringIO as SomethingIO + from urlparse import urlparse +else: + from io import BytesIO 
as SomethingIO + from urllib.parse import urlparse + +sys.path.insert(0, '../athena_example/') +import config + +s3 = boto3.client('s3', region_name=config.REGION, + aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) + +athena = boto3.client('athena', region_name=config.REGION, + aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) + +def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs): + query_execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True) + df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs) + return df + + +def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True): + query_execution_id = athena.start_query_execution( + QueryString=query, + QueryExecutionContext={ + 'Database': database + }, + WorkGroup=workgroup, + ResultConfiguration={ + "OutputLocation": output_location + } + )['QueryExecutionId'] + + seconds_to_wait = 1 + + if wait_until_finished: + while True: + time.sleep(seconds_to_wait) + seconds_to_wait += 1 +# seconds_to_wait *= 2 + + execution = athena.get_query_execution( + QueryExecutionId=query_execution_id + ) + + if execution['QueryExecution']['Status']['State'] not in ['QUEUED', 'RUNNING']: + break + + if execution['QueryExecution']['Status']['State'] != 'SUCCEEDED': + raise Exception("Athena query failed: %s" % ( execution['QueryExecution']['Status']['StateChangeReason'],), query_execution_id) + + return query_execution_id + +# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py +# Import it instead, when it's updated. 
+def is_s3_url(url): + """Check for an s3, s3n, or s3a url""" + try: + return urlparse(url).scheme in ["s3", "s3n", "s3a"] + except Exception: + return False +def seperate_bucket_key(url): + m = re.match('s3://([^/]+)/(.*)', url) + return m.group(1), m.group(2) + +def list_all(path): + if is_s3_url(path): + bucket, key = seperate_bucket_key(path) + objects = s3.list_objects_v2(Bucket=bucket, Prefix=key) + return [key['Key'] for key in objects['Contents']] + from os import listdir + from os.path import isfile, join + return listdir(path) + + +def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs): + bucket, key = seperate_bucket_key(filepath_or_buffer) + obj = s3.get_object(Bucket=bucket, Key=key) + return pd.read_csv(SomethingIO(obj['Body'].read()), **kwargs) + +def read(filename, verbose=True): + log ("Reading {}".format(filename), verbose=verbose) + if is_s3_url(filename): + bucket, key = seperate_bucket_key(filename) + obj=s3.get_object(Bucket=bucket, Key=key) + return obj['Body'].read() + with open (filename) as f: + return f.read() + +def write(body, filename): + bucket, key = seperate_bucket_key(filename) + s3.put_object(Bucket=bucket, Key=key, Body=body) + return + + +def file_exists(filename): + bucket, key = seperate_bucket_key(filename) + try: + s3.get_object(Bucket=bucket, Key=key) + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code']=='NoSuchKey': + return False + else: + # Something else has gone wrong. 
+ raise + else: + return True + + +def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False): + if verbose == False: + return + if timestamped: + outstr = "[%s]\t%s\n" % (str(datetime.datetime.now()) , outstr) + else: + outstr = "%s\n" % (outstr,) + + with open(logfile_name, "a") as logfile: + logfile.write(outstr) + + if not quiet: + sys.stdout.write(outstr); + sys.stdout.flush() +# Print iterations progress diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb index 01c42392..ab222233 100644 --- a/notebooks/athena_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -2,134 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting dedupe\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5e/09/179feb316147279c76ea7e6dc5a5f9e00a6feadaeda131d535247e580619/dedupe-2.0.3-cp36-cp36m-manylinux1_x86_64.whl (89kB)\n", - "\u001b[K 100% |████████████████████████████████| 92kB 239kB/s ta 0:00:011\n", - "\u001b[?25hCollecting pyathena\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/40/85/f37c049922f5d47e9126d7817ef7b8fb7abb2e6a9ea0dd06adcbffc0e8bc/PyAthena-1.10.8-py2.py3-none-any.whl (53kB)\n", - "\u001b[K 100% |████████████████████████████████| 61kB 1.9MB/s ta 0:00:011\n", - "\u001b[?25hCollecting haversine>=0.4.1 (from dedupe)\n", - " Downloading https://files.pythonhosted.org/packages/72/8e/6df8b563dd6b2961a36cd740b34c00b89142f1b97d92092c133379b2973f/haversine-2.2.0-py2.py3-none-any.whl\n", - "Collecting simplecosine>=1.2 (from dedupe)\n", - " Downloading https://files.pythonhosted.org/packages/2d/22/6ea3a5ab8aea06d6563eb927e706f7342a00d1849c9be6143a2a7d84ddbd/simplecosine-1.2-py2.py3-none-any.whl\n", - "Collecting rlr>=2.4.3 (from dedupe)\n", - " Downloading 
https://files.pythonhosted.org/packages/fa/02/3b1a9727a622ff4320919645ce35ceb887d90784d0bab41484756c33b7ea/rlr-2.4.5-py2.py3-none-any.whl\n", - "Collecting categorical-distance>=1.9 (from dedupe)\n", - " Downloading https://files.pythonhosted.org/packages/1d/b7/4f97771f52c63916f4e4d349a644c2387961592e76070e7310463b2d70a5/categorical_distance-1.9-py3-none-any.whl\n", - "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.14.3)\n", - "Collecting fastcluster (from dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1e/9d/3d7525a4722ee4a11ad969762d1de53b6dac326b5ac1366221e06958e1d7/fastcluster-1.1.26-cp36-cp36m-manylinux1_x86_64.whl (154kB)\n", - "\u001b[K 100% |████████████████████████████████| 163kB 707kB/s ta 0:00:01\n", - "\u001b[?25hCollecting highered>=0.2.0 (from dedupe)\n", - " Downloading https://files.pythonhosted.org/packages/81/00/cbd902cfd14ad1992fcdaa11a615d47b36b6136dc690e19b0afa58c7365d/highered-0.2.1-py2.py3-none-any.whl\n", - "Collecting dedupe-hcluster (from dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b2/1f/c6f6075c2e988b3a1759fabaf91d2f8f2de59c6e607a3fd9a2e06112a0de/dedupe_hcluster-0.3.8-cp36-cp36m-manylinux1_x86_64.whl (531kB)\n", - "\u001b[K 100% |████████████████████████████████| 532kB 5.2MB/s ta 0:00:01\n", - "\u001b[?25hCollecting BTrees>=4.1.4 (from dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/48/b3/9ce3b32817db98e8bf20d6873e18ee3ee7feded135434d800b72bf8dfb9f/BTrees-4.7.2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n", - "\u001b[K 100% |████████████████████████████████| 3.0MB 8.2MB/s eta 0:00:01\n", - "\u001b[?25hCollecting Levenshtein-search (from dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/93/89/dc320196d10447540c95f58eab5dd316a2166310356c1d88b84724f4e793/Levenshtein_search-1.4.5-cp36-cp36m-manylinux1_x86_64.whl (59kB)\n", - "\u001b[K 100% 
|████████████████████████████████| 61kB 21.2MB/s ta 0:00:01\n", - "\u001b[?25hCollecting zope.index (from dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ab/0f/f93bddfac1189bb6b973142da3ef2caa6817a59b07ca448095a30b644737/zope.index-5.0.0-cp36-cp36m-manylinux1_x86_64.whl (101kB)\n", - "\u001b[K 100% |████████████████████████████████| 102kB 17.6MB/s a 0:00:01\n", - "\u001b[?25hCollecting typing-extensions (from dedupe)\n", - " Downloading https://files.pythonhosted.org/packages/0c/0e/3f026d0645d699e7320b59952146d56ad7c374e9cd72cd16e7c74e657a0f/typing_extensions-3.7.4.2-py3-none-any.whl\n", - "Collecting affinegap>=1.3 (from dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b2/6a/91f5defe8178104449bc897208c9780b159575d16a959a5074f0bf39a6f0/affinegap-1.11-cp36-cp36m-manylinux1_x86_64.whl (45kB)\n", - "\u001b[K 100% |████████████████████████████████| 51kB 12.0MB/s ta 0:00:01\n", - "\u001b[?25hCollecting doublemetaphone (from dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c0/27/8df369334aac64755ca899b9a7cc4d2d60e800cca148322ef19309cdae0f/DoubleMetaphone-0.1-cp36-cp36m-manylinux1_x86_64.whl (78kB)\n", - "\u001b[K 100% |████████████████████████████████| 81kB 3.4MB/s eta 0:00:01\n", - "\u001b[?25hCollecting dedupe-variable-datetime (from dedupe)\n", - " Downloading https://files.pythonhosted.org/packages/65/8f/d21f6acadcdfd681ee038153883b5673b8b76f790e465d791780e6b7bf60/dedupe_variable_datetime-0.1.5-py3-none-any.whl\n", - "Collecting tenacity>=4.1.0 (from pyathena)\n", - " Downloading https://files.pythonhosted.org/packages/b5/05/ff089032442058bd3386f9cd991cd88ccac81dca1494d78751621ee35e62/tenacity-6.2.0-py2.py3-none-any.whl\n", - "Requirement already satisfied: botocore>=1.5.52 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.15.39)\n", - "Collecting future (from pyathena)\n", - "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)\n", - "\u001b[K 100% |████████████████████████████████| 829kB 14.2MB/s ta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: boto3>=1.4.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.12.39)\n", - "Collecting pylbfgs (from rlr>=2.4.3->dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b8/5b/b8e1ef62e5e5b034ce5ae919b64158ec8da4f64c995444aec7fd96e8ec42/PyLBFGS-0.2.0.13-cp36-cp36m-manylinux1_x86_64.whl (205kB)\n", - "\u001b[K 100% |████████████████████████████████| 215kB 16.4MB/s ta 0:00:01\n", - "\u001b[?25hCollecting pyhacrf-datamade>=0.2.0 (from highered>=0.2.0->dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/f5/971e17a8b6686d5fc3d562e29e9c902743eb5f0f4436880b86cb11c0149c/pyhacrf_datamade-0.2.5-cp36-cp36m-manylinux1_x86_64.whl (788kB)\n", - "\u001b[K 100% |████████████████████████████████| 798kB 14.5MB/s ta 0:00:01\n", - "\u001b[?25hCollecting zope.interface (from BTrees>=4.1.4->dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/fc/7e/8e1efcfa22b722a0d6e992172ab15a871988c290cb722fe8da6d11f1aeb2/zope.interface-5.1.0-cp36-cp36m-manylinux1_x86_64.whl (234kB)\n", - "\u001b[K 100% |████████████████████████████████| 235kB 16.6MB/s ta 0:00:01\n", - "\u001b[?25hCollecting persistent>=4.1.0 (from BTrees>=4.1.4->dedupe)\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2e/4e/9bde9a2f63273f2e63a94a8198781aac559cc6efd2f560d69afcb0d9d8b5/persistent-4.6.4-cp36-cp36m-manylinux1_x86_64.whl (246kB)\n", - "\u001b[K 100% |████████████████████████████████| 256kB 17.5MB/s ta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.11.0)\n", - "Requirement already satisfied: setuptools in 
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (39.1.0)\n", - "Collecting datetime-distance (from dedupe-variable-datetime->dedupe)\n", - " Downloading https://files.pythonhosted.org/packages/6b/98/a5eff9256ff27e3bb8030466dabd772002e5014b9237cbeb18c542050ff5/datetime_distance-0.1.3-py3-none-any.whl\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (2.7.3)\n", - "Requirement already satisfied: docutils<0.16,>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.14)\n", - "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.9.4)\n", - "Requirement already satisfied: urllib3<1.26,>=1.20; python_version != \"3.4\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (1.23)\n", - "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from boto3>=1.4.4->pyathena) (0.3.3)\n", - "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.11.5)\n", - "Requirement already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == \"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.18)\n", - "Building wheels for collected packages: future\n", - " Running setup.py bdist_wheel for future ... 
\u001b[?25ldone\n", - "\u001b[?25h Stored in directory: /home/ec2-user/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Successfully built future\n", - "Installing collected packages: haversine, simplecosine, future, pylbfgs, rlr, categorical-distance, fastcluster, pyhacrf-datamade, highered, dedupe-hcluster, zope.interface, persistent, BTrees, Levenshtein-search, zope.index, typing-extensions, affinegap, doublemetaphone, datetime-distance, dedupe-variable-datetime, dedupe, tenacity, pyathena\n", - "Successfully installed BTrees-4.7.2 Levenshtein-search-1.4.5 affinegap-1.11 categorical-distance-1.9 datetime-distance-0.1.3 dedupe-2.0.3 dedupe-hcluster-0.3.8 dedupe-variable-datetime-0.1.5 doublemetaphone-0.1 fastcluster-1.1.26 future-0.18.2 haversine-2.2.0 highered-0.2.1 persistent-4.6.4 pyathena-1.10.8 pyhacrf-datamade-0.2.5 pylbfgs-0.2.0.13 rlr-2.4.5 simplecosine-1.2 tenacity-6.2.0 typing-extensions-3.7.4.2 zope.index-5.0.0 zope.interface-5.1.0\n", - "\u001b[33mYou are using pip version 10.0.1, however version 20.2b1 is available.\n", - "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install dedupe pyathena" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import sys\n", - "sys.path.insert(0, '../athena_example/')\n", - "import config\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "module 'logging' has no attribute 'logging'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;31m## Armin\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 87\u001b[0;31m \u001b[0mlog_level\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEBUG\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 88\u001b[0m \u001b[0;31m#######\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: module 'logging' has no attribute 'logging'" - ] - } - ], "source": [ "# %load ../mysql_example/mysql_example.py\n", "#!/usr/bin/python\n", @@ -151,6 +26,12 @@ "[csv_example](csv_example.html)\n", "\"\"\"\n", "\n", + "# There is a little bit difference between the result \n", + "# of this module and the mysql one. The reason is due to\n", + "# Some special (and mostly erroneous) characters, such as \\a .. 
\n", + "# Which are dealt with differently by mysql and athena/panda\n", + "\n", + "import sys\n", "import os\n", "import itertools\n", "import time\n", @@ -162,19 +43,17 @@ "import csv\n", "import pandas as pd\n", "\n", - "# import MySQLdb\n", - "# import MySQLdb.cursors\n", - "\n", "import boto3\n", - "from pyathena import connect\n", - "from pyathena.pandas_cursor import PandasCursor\n", "import dedupe\n", "import dedupe.backport\n", + "sys.path.insert(0, '../athena_example/')\n", + "import config\n", + "sys.path.insert(0, '../athena_example/')\n", + "import utils\n", "\n", - "def dict_cursor_execute(cur, query):\n", - " df = cur.execute(query).as_pandas()\n", - " return df.where(pd.notnull(df), None).astype(str)\n", - "\n", + "def as_pandas(query, **kwrgs):\n", + " df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n", + " return df.where(pd.notnull(df), None)\n", "\n", "def record_pairs(result_set):\n", " for i, row in enumerate(result_set):\n", @@ -204,67 +83,28 @@ " # for convenience. 
To enable verbose output, run `python\n", " # examples/mysql_example/mysql_example.py -v`\n", " \n", - "# optp = optparse.OptionParser()\n", - "# optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", - "# help='Increase verbosity (specify multiple times for more)'\n", - "# )\n", - "# (opts, args) = optp.parse_args()\n", - "# log_level = logging.WARNING\n", - "# if opts.verbose:\n", - "# if opts.verbose == 1:\n", - "# log_level = logging.INFO\n", - "# elif opts.verbose >= 2:\n", - "# log_level = logging.DEBUG\n", - "\n", - "## Armin\n", - " log_level = logging.DEBUG\n", - "#######\n", + " optp = optparse.OptionParser()\n", + " optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", + " help='Increase verbosity (specify multiple times for more)'\n", + " )\n", + " (opts, args) = optp.parse_args()\n", + " log_level = logging.WARNING\n", + " if opts.verbose:\n", + " if opts.verbose == 1:\n", + " log_level = logging.INFO\n", + " elif opts.verbose >= 2:\n", + " log_level = logging.DEBUG\n", + "\n", "\n", " logging.getLogger().setLevel(log_level)\n", "\n", " \n", "\n", - "# # ## Setup\n", - "# MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf'\n", "\n", " settings_file = 'mysql_example_settings'\n", " training_file = 'mysql_example_training.json'\n", "\n", - " start_time = time.time()\n", - "\n", - " # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to\n", - " # `examples/mysql_example/mysql.cnf` and fill in your mysql database\n", - " # information in `examples/mysql_example/mysql.cnf`\n", - "\n", - " # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid\n", - " # having to have enormous result sets in\n", - " # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor).\n", - "# read_con = MySQLdb.connect(db='contributions',\n", - "# charset='utf8',\n", - "# read_default_file=MYSQL_CNF,\n", - "# cursorclass=MySQLdb.cursors.SSDictCursor)\n", - "\n", - "# write_con = 
MySQLdb.connect(db='contributions',\n", - "# charset='utf8',\n", - "# read_default_file=MYSQL_CNF)\n", - "\n", - " s3 = boto3.client('s3') \n", - " conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n", - " aws_secret_access_key=config.SECRET_ACCESS_KEY,\n", - " s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n", - " region_name=config.REGION, \n", - " work_group=config.WORKGROUP) \n", - " cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !rm 'mysql_example_settings'\n", - "# !rm 'mysql_example_training.json'" + " start_time = time.time()" ] }, { @@ -306,17 +146,13 @@ "\n", " # We will sample pairs from the entire donor table for training\n", "# with read_con.cursor() as cur:\n", - "# cur.execute(DONOR_SELECT)\n", - "# temp_d = {i: row for i, row in enumerate(cur)}\n", "\n", " # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! 
\n", - " # But for that, astype doesn't work, and a loot on temp_d is slow, so for now let's just use str\n", - " with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n", - " # Something like this is much faster, but let's keep the changes minimal for now\n", - " # df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n", - " # temp_d = df.where(pd.notnull(df), None).to_dict('index')\n", - " cursor_df = dict_cursor_execute(cursor, DONOR_SELECT)\n", - " temp_d = cursor_df.to_dict('index')\n", + " # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str\n", + "# with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n", + " temp_df = as_pandas(DONOR_SELECT)\n", + " temp_d = temp_df.to_dict('index')\n", + " \n", "\n", " # If we have training data saved from a previous run of dedupe,\n", " # look for it an load it in.\n", @@ -374,14 +210,7 @@ " # To run blocking on such a large set of data, we create a separate table\n", " # that contains blocking keys and record ids\n", " print('creating blocking_map database')\n", - "# with write_con.cursor() as cur:\n", - "# cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n", - "# cur.execute(\"CREATE TABLE blocking_map \"\n", - "# \"(block_key VARCHAR(200), donor_id INTEGER) \"\n", - "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", - "\n", - "# write_con.commit()\n", - " cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n", + " utils.athena_start_query(\"DROP TABLE IF EXISTS blocking_map\")\n", "\n", " q='''\n", " CREATE EXTERNAL TABLE blocking_map \n", @@ -396,7 +225,7 @@ " --'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') \n", - " cur.execute(q)" + " utils.athena_start_query(q)" ] }, { @@ -409,12 +238,14 @@ " # through the data and create indices.\n", " print('creating inverted index')\n", "\n", + " # Armin: \n", + " # This never runs, index_fields is 
empty, possible bug?\n", " for field in deduper.fingerprinter.index_fields:\n", " q = '''\n", " SELECT DISTINCT {field} FROM processed_donors \n", " WHERE {field} IS NOT NULL\n", " '''.format(field=field)\n", - " cur_df = dict_cursor_execute(cur, q)\n", + " cur_df = as_pandas(q)\n", " # Do I need to cast it as a list?\n", " field_data = cur_df[field]\n", " deduper.fingerprinter.index(field_data, field)\n", @@ -432,17 +263,10 @@ " print('writing blocking map')\n", " \n", "\n", - " read_cur_dict = dict_cursor_execute(cur, DONOR_SELECT).to_dict('records')\n", + " read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records')\n", " full_data = ((row['donor_id'], row) for row in read_cur_dict)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -450,16 +274,7 @@ "outputs": [], "source": [ " b_data = deduper.fingerprinter(full_data)\n", - " buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n", - "# csv_out.writerows(b_data) \n", - "\n", - "# \"\\n\".join(b_data)\n", - "# with write_con.cursor() as write_cur:\n", - "\n", - "# write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n", - "# b_data)\n", - " s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer) \n", - "# write_con.commit()" + " buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t') utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer) \n" ] }, { @@ -468,13 +283,6 @@ "metadata": {}, "outputs": [], "source": [ - " # indexing blocking_map\n", - "# print('creating index')\n", - "# with write_con.cursor() as cur:\n", - "# cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n", - "\n", - "# write_con.commit()\n", - "# read_con.commit()\n", "\n", " # select unique 
pairs to compare\n", " q='''\n", @@ -494,7 +302,7 @@ " INNER JOIN processed_donors a on ids.east=a.donor_id\n", " INNER JOIN processed_donors b on ids.west=b.donor_id\n", " '''\n", - " read_cur_dict=dict_cursor_execute(cur, q).itertuples(index=False, name=False)" + " read_cur_dict=as_pandas(q).itertuples(index=False, name=None)" ] }, { @@ -507,7 +315,7 @@ "\n", " print('clustering...')\n", " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n", - " threshold=0.5)" + " threshold=0.5)" ] }, { @@ -516,7 +324,7 @@ "metadata": {}, "outputs": [], "source": [ - " cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n", + " utils.athena_start_query(\"DROP TABLE IF EXISTS entity_map\")\n", "\n", " print('creating entity_map database')\n", " q='''\n", @@ -532,12 +340,11 @@ " 'classification'='csv', \n", " --'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - " '''.format(bucket, root_key+'entity_map') \n", - " cur.execute(q) \n", + " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') \n", + " utils.athena_start_query(q) \n", "\n", " buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n", - " s3.put_object(Bucket=bucket, Key=root_key+'entity_map/entity_map.csv', Body=buffer) \n", - "\n" + " utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer) \n" ] }, { @@ -556,10 +363,9 @@ " #\n", " # For example, let's see who the top 10 donors are.\n", "\n", - " locale.setlocale(locale.LC_ALL, '') # for pretty printing numbers\n", + " locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers\n", " \n", - " cur.execute(\"DROP TABLE IF EXISTS e_map\")\n", - "\n", + " utils.athena_start_query(\"DROP TABLE IF EXISTS e_map\")\n", " q = '''\n", " CREATE TABLE e_map as \n", " SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id \n", @@ -567,7 +373,7 @@ " 
RIGHT JOIN donors USING(donor_id)\n", " '''\n", " \n", - " cur.execute(q)\n", + " utils.athena_start_query(q)\n", " q ='''\n", " SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", " donation_totals.totals AS totals \n", @@ -580,38 +386,38 @@ " DESC LIMIT 10) \n", " AS donation_totals \n", " ON donors.donor_id = donation_totals.canon_id\n", + " ORDER BY totals DESC\n", " '''\n", - " cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n", + " cur_dict = as_pandas(q).to_dict('records')\n", "\n", " print(\"Top Donors (deduped)\")\n", " for row in cur_dict:\n", " row['totals'] = locale.currency(row['totals'], grouping=True)\n", - " print('%(totals)20s: %(name)s' % row)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " print('%(totals)20s: %(name)s' % row)\n", + "\n", " # Compare this to what we would have gotten if we hadn't done any\n", " # deduplication\n", "\n", " q = '''\n", - " SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", - " SUM(cast(contributions.amount as double)) AS totals \n", - " FROM donors INNER JOIN contributions \n", - " USING (donor_id) \n", - " GROUP BY donor_id), name\n", + " with donorscontributions as(\n", + "\n", + " SELECT donors.donor_id, \n", + " array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", + " cast(contributions.amount as double) as amount\n", + " FROM donors INNER JOIN contributions \n", + " USING (donor_id) \n", + " )\n", + " SELECT name, sum(amount) AS totals \n", + " FROM donorscontributions\n", + " GROUP BY donor_id, name\n", " ORDER BY totals DESC \n", - " LIMIT 10\")\n", + " LIMIT 10\n", " '''\n", "\n", - " cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n", + " cur_dict = as_pandas(q).to_dict('records')\n", "\n", " print(\"Top Donors (raw)\")\n", - " for row in cur:\n", + " for row in 
cur_dict:\n", " row['totals'] = locale.currency(row['totals'], grouping=True)\n", " print('%(totals)20s: %(name)s' % row)\n", "\n", @@ -622,73 +428,22 @@ " print('ran in', time.time() - start_time, 'seconds')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# I'm here\n", - "Found a way to map block_key to block_numbers\n", - "** CREATE TABLE, according to some thing online, has more timeout!\n", - "** Looks like i should be using (bucketing)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Problem:\n", - "The athena mapping doesn't have many distinct values, a huge number for example have 6061:None:2, while there is only one like this in sql!?\n", - "The problem, probably was probably address, the concat was buggy and there were too many nulls.\n", - "Still while raw table matches, donors don't! The athena is too much bigger\n", - "Start from here: Run this query on both, the results are different" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], - "source": [ - "create table as_blocking_map_number\n", - "with (bucketed_by = block_number)\n", - "as( \n", - " SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n", - " from blocking_map)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NbConvertApp] Converting notebook athena_example.ipynb to script\n", + "[NbConvertApp] Writing 11731 bytes to ../athena_example/athena_example.py\n" + ] + } + ], "source": [ - "%%time\n", - "import sys\n", - "sys.path.insert(0, '../athena_example/')\n", - "from pyathena import connect\n", - "from pyathena.pandas_cursor import PandasCursor\n", - "\n", - "import config\n", - "\n", - "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n", - " aws_secret_access_key=config.SECRET_ACCESS_KEY,\n", - " 
s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n", - " region_name=config.REGION, \n", - " work_group=config.WORKGROUP) \n", - "cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)\n", - "q='''\n", - "with blocking_map_number as( \n", - " SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n", - " from blocking_map)\n", - "create table donor_id_pairs as (\n", - " SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", - " from blocking_map_number as l\n", - " INNER JOIN blocking_map_number as r\n", - " using (block_number)\n", - " where l.donor_id < r.donor_id)\n", - "'''\n", - "cur.execute(q)" + "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/" ] }, { @@ -716,6 +471,13 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } } }, "nbformat": 4, diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb index 5e8a5a32..47e75969 100644 --- a/notebooks/athena_init_db.ipynb +++ b/notebooks/athena_init_db.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -15,13 +15,14 @@ ], "source": [ "%%writefile ../athena_example/config.py\n", + "LOG_FILE = 'log.txt'\n", "# Connection parameters\n", "ACCESS_KEY_ID = None\n", "SECRET_ACCESS_KEY = None\n", "ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n", "WORKGROUP = 'RIA'\n", "REGION = 'eu-west-1'\n", - "SCHEMA_NAME = 'ria_data_science_s3'\n", + "DATABASE = 'ria_data_science_s3'\n", "\n", "# Database Parameters\n", "DATABASE_BUCKET = 'com.ria.scratch'\n", @@ -30,19 +31,19 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Overwriting ../athena_example/athena_example.py\n" + 
"Writing ../athena_example/athena_init.py\n" ] } ], "source": [ - "%%writefile ../athena_example/athena_example.py\n", + "%%writefile ../athena_example/athena_init.py\n", "#!/usr/bin/python\n", "\"\"\"\n", "This is a setup script for athena_example. It downloads a zip file of\n", @@ -66,9 +67,11 @@ "import numpy as np\n", "from urllib.request import urlopen\n", "import boto3\n", - "from pyathena import connect\n", "import config\n", "import csv\n", + "import sys\n", + "sys.path.insert(0, '../athena_example/')\n", + "import utils\n", "\n", "\n", "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", @@ -92,15 +95,6 @@ "\n", "\n", "\n", - "def as_pandas(query, **kwrgs):\n", - " return utils.athena_to_panda(query, escapechar='\\\\', dtype='object', keep_default_na=False, na_values=[''], **kwrgs)\n", - "\n", - "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n", - " aws_secret_access_key=config.SECRET_ACCESS_KEY,\n", - " s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n", - " region_name=config.REGION, \n", - " work_group=config.WORKGROUP)\n", - "c = conn.cursor(schema_name=config.SCHEMA_NAME)\n", "\n", "print('importing raw data from csv...')\n", "utils.athena_start_query(\"DROP TABLE IF EXISTS raw_table\")\n", @@ -127,8 +121,8 @@ " committee_name VARCHAR(70), committee_id VARCHAR(37)) \n", "ROW FORMAT DELIMITED\n", " FIELDS TERMINATED BY '\\t'\n", - " LINES TERMINATED BY '\\n' \n", " ESCAPED BY '\\\\'\n", + " LINES TERMINATED BY '\\n' \n", "LOCATION\n", " 's3://{}/{}' \n", "TBLPROPERTIES (\n", @@ -164,20 +158,28 @@ "\n", "# Nullifying empty strings\n", "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n", - "df_lower=df.apply(lambda x: x.str.lower() if x.dtype=='object' else x, result_type='expand')\n", + "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", + "\n", "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, 
sep=\"\\t\", escapechar='\\\\', index=None),\n", " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))\n", "\n", + "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n", + "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n", + "# So we need to manually take care of that\n", "print('creating donors table...')\n", "q='''\n", "CREATE TABLE donors as\n", " with tmp as\n", " (SELECT DISTINCT \n", - " TRIM(last_name) as last_name, TRIM(first_name) as first_name, \n", - " TRIM(address_1) as address_1, TRIM(address_2) as address_2, \n", - " TRIM(city) city, TRIM(state) as state, \n", - " TRIM(zip) as zip, TRIM(employer) as employer, \n", - " TRIM(occupation) as occupation\n", + " NULLIF(TRIM(last_name), '') as last_name, \n", + " NULLIF(TRIM(first_name), '') as first_name, \n", + " NULLIF(TRIM(address_1), '') as address_1, \n", + " NULLIF(TRIM(address_2), '') as address_2, \n", + " NULLIF(TRIM(city), '') city, \n", + " NULLIF(TRIM(state), '') as state, \n", + " NULLIF(TRIM(zip), '') as zip, \n", + " NULLIF(TRIM(employer), '') as employer, \n", + " NULLIF(TRIM(occupation), '') as occupation\n", " FROM raw_table)\n", " SELECT row_number() over () as donor_id, * from tmp'''\n", "utils.athena_start_query(q)\n", @@ -185,14 +187,33 @@ "\n", "q='''\n", "CREATE TABLE recipients as\n", - " SELECT DISTINCT committee_id, committee_name FROM raw_table\n", + " SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table\n", "'''\n", "utils.athena_start_query(q)\n", "\n", "print('creating contributions table')\n", + "\n", + "# --\n", + "# c.execute(\"CREATE TABLE contributions \"\n", + "# \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n", + "# \" report_type VARCHAR(24), date_recieved DATE, \"\n", + "# \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n", + "# \" receipt_type VARCHAR(23), 
\"\n", + "# \" vendor_last_name VARCHAR(70), \"\n", + "# \" vendor_first_name VARCHAR(20), \"\n", + "# \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n", + "# \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n", + "# \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n", + "# \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n", + "# \" report_period_begin DATE, report_period_end DATE) \"\n", + "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", + "# --\n", + "\n", "q='''\n", "CREATE TABLE contributions as\n", - " SELECT reciept_id, donors.donor_id, committee_id, \n", + " SELECT reciept_id as contribution_id, \n", + " donors.donor_id as donor_id , \n", + " committee_id as recipient_id, \n", " report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n", " loan_amount, amount, \n", " receipt_type, vendor_last_name , \n", @@ -202,15 +223,16 @@ " date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n", " date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n", " FROM raw_table JOIN donors ON \n", - " donors.first_name = TRIM(raw_table.first_name) AND \n", - " donors.last_name = TRIM(raw_table.last_name) AND \n", - " donors.address_1 = TRIM(raw_table.address_1) AND \n", - " donors.address_2 = TRIM(raw_table.address_2) AND \n", - " donors.city = TRIM(raw_table.city) AND \n", - " donors.state = TRIM(raw_table.state) AND \n", - " donors.employer = TRIM(raw_table.employer) AND \n", - " donors.occupation = TRIM(raw_table.occupation) AND \n", - " donors.zip = TRIM(raw_table.zip)'''\n", + " coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND \n", + " coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND \n", + " coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND \n", + " coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND \n", + " coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') 
AND \n", + " coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND \n", + " coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND \n", + " coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND \n", + " coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')'''\n", + "\n", "utils.athena_start_query(q)\n", "\n", "q = '''\n", @@ -225,7 +247,7 @@ " LOWER(state) AS state, \n", " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n", " THEN NULL \n", - " ELSE LOWER(array_join(filter(array[address_1, address_1], x-> x IS NOT NULL), ' '))\n", + " ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n", " END AS address, \n", " LOWER(occupation) AS occupation, \n", " LOWER(employer) AS employer, \n", @@ -241,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -254,11 +276,6 @@ "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n", - "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n", "creating donors table...\n", "creating contributions table\n", "done\n" @@ -266,7 +283,7 @@ } ], "source": [ - "!python ../athena_example/athena_example.py" + "!python ../athena_example/athena_init.py" ] }, { From da660eb8f8ecbc7cccbaac7b463119b175c7dec3 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 29 Oct 2020 15:17:02 +0000 
Subject: [PATCH 08/19] checkpoint --- athena_example/athena_init.py | 52 ++++++++-------- athena_example/config.py | 8 +-- athena_example/utils.py | 1 + notebooks/athena_example.ipynb | 107 +++++++++++++++++---------------- notebooks/athena_init_db.ipynb | 92 +++++++++++----------------- 5 files changed, 120 insertions(+), 140 deletions(-) diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py index 9ddb14c8..f8bac6e0 100644 --- a/athena_example/athena_init.py +++ b/athena_example/athena_init.py @@ -8,7 +8,7 @@ [athena_example.py](athena_example.py). Tables created: -* raw_table - raw import of entire CSV file +* as_raw_table - raw import of entire CSV file * donors - all distinct donors based on name and address * recipients - all distinct campaign contribution recipients * contributions - contribution amounts tied to donor and recipients tables @@ -51,15 +51,15 @@ print('importing raw data from csv...') -utils.athena_start_query("DROP TABLE IF EXISTS raw_table") -utils.athena_start_query("DROP TABLE IF EXISTS donors") -utils.athena_start_query("DROP TABLE IF EXISTS recipients") -utils.athena_start_query("DROP TABLE IF EXISTS contributions") -utils.athena_start_query("DROP TABLE IF EXISTS processed_donors") +utils.athena_start_query("DROP TABLE IF EXISTS as_raw_table") +utils.athena_start_query("DROP TABLE IF EXISTS as_donors") +utils.athena_start_query("DROP TABLE IF EXISTS as_recipients") +utils.athena_start_query("DROP TABLE IF EXISTS as_contributions") +utils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors") q=r''' -CREATE EXTERNAL TABLE raw_table +CREATE EXTERNAL TABLE as_raw_table (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), @@ -83,7 +83,7 @@ 'classification'='csv', 'skip.header.line.count'='1', 'serialization.null.format'='') -'''.format(config.DATABASE_BUCKET, 
config.DATABASE_ROOT_KEY+'raw_table') +'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') utils.athena_start_query(q) @@ -115,14 +115,14 @@ df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand') utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None), - filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,)) + filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,)) # Athena is doesn't equate empty string and null, eventhough in the table spec we said so # Not that it's a bug, it works if the string is null in the source, but not after applying trim to it # So we need to manually take care of that print('creating donors table...') q=''' -CREATE TABLE donors as +CREATE TABLE as_donors as with tmp as (SELECT DISTINCT NULLIF(TRIM(last_name), '') as last_name, @@ -134,14 +134,14 @@ NULLIF(TRIM(zip), '') as zip, NULLIF(TRIM(employer), '') as employer, NULLIF(TRIM(occupation), '') as occupation - FROM raw_table) + FROM as_raw_table) SELECT row_number() over () as donor_id, * from tmp''' utils.athena_start_query(q) q=''' -CREATE TABLE recipients as - SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table +CREATE TABLE as_recipients as + SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table ''' utils.athena_start_query(q) @@ -164,7 +164,7 @@ # -- q=''' -CREATE TABLE contributions as +CREATE TABLE as_contributions as SELECT reciept_id as contribution_id, donors.donor_id as donor_id , committee_id as recipient_id, @@ -176,21 +176,21 @@ election_type, election_year, date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, date_parse(report_period_end, '%m/%d/%Y') as report_period_end - FROM raw_table 
JOIN donors ON - coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND - coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND - coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND - coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND - coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND - coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND - coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND - coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND - coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')''' + FROM as_raw_table JOIN as_donors donors ON + coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND + coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND + coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND + coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND + coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND + coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND + coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND + coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND + coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')''' utils.athena_start_query(q) q = ''' -CREATE TABLE processed_donors AS +CREATE TABLE as_processed_donors AS SELECT donor_id, LOWER(city) AS city, CASE WHEN (first_name IS NULL AND last_name IS NULL) @@ -206,7 +206,7 @@ LOWER(occupation) AS occupation, LOWER(employer) AS employer, first_name is null AS person - FROM donors''' + FROM as_donors''' utils.athena_start_query(q) diff --git a/athena_example/config.py b/athena_example/config.py index 60964c73..3715b750 100644 --- a/athena_example/config.py +++ b/athena_example/config.py @@ -2,11 +2,11 
@@ # Connection parameters ACCESS_KEY_ID = None SECRET_ACCESS_KEY = None -ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/' -WORKGROUP = 'RIA' +ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds' +WORKGROUP = 'RDS' REGION = 'eu-west-1' -DATABASE = 'ria_data_science_s3' +DATABASE = 'ria_tmp' # Database Parameters -DATABASE_BUCKET = 'com.ria.scratch' +DATABASE_BUCKET = 'ria-temp' DATABASE_ROOT_KEY = 'as-dedupe/' diff --git a/athena_example/utils.py b/athena_example/utils.py index 77f18fda..1b8b935a 100644 --- a/athena_example/utils.py +++ b/athena_example/utils.py @@ -73,6 +73,7 @@ def is_s3_url(url): return urlparse(url).scheme in ["s3", "s3n", "s3a"] except Exception: return False + def seperate_bucket_key(url): m = re.match('s3://([^/]+)/(.*)', url) return m.group(1), m.group(2) diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb index ab222233..da896697 100644 --- a/notebooks/athena_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install dedupe" + ] + }, { "cell_type": "code", "execution_count": null, @@ -75,7 +84,8 @@ " yield donor_id, cluster_id, score\n", "\n", "\n", - "if __name__ == '__main__':\n", + "# if __name__ == '__main__':\n", + "if True:\n", "\n", " # ## Logging\n", "\n", @@ -83,17 +93,17 @@ " # for convenience. 
To enable verbose output, run `python\n", " # examples/mysql_example/mysql_example.py -v`\n", " \n", - " optp = optparse.OptionParser()\n", - " optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", - " help='Increase verbosity (specify multiple times for more)'\n", - " )\n", - " (opts, args) = optp.parse_args()\n", + "# optp = optparse.OptionParser()\n", + "# optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", + "# help='Increase verbosity (specify multiple times for more)'\n", + "# )\n", + "# (opts, args) = optp.parse_args()\n", " log_level = logging.WARNING\n", - " if opts.verbose:\n", - " if opts.verbose == 1:\n", - " log_level = logging.INFO\n", - " elif opts.verbose >= 2:\n", - " log_level = logging.DEBUG\n", + "# if opts.verbose:\n", + "# if opts.verbose == 1:\n", + "# log_level = logging.INFO\n", + "# elif opts.verbose >= 2:\n", + "# log_level = logging.DEBUG\n", "\n", "\n", " logging.getLogger().setLevel(log_level)\n", @@ -119,7 +129,7 @@ " # We did a fair amount of preprocessing of the fields in\n", " # `mysql_init_db.py` \n", " DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n", - " \"from processed_donors\"\n", + " \"from as_processed_donors\"\n", "\n", " # ## Training\n", "\n", @@ -209,11 +219,11 @@ "\n", " # To run blocking on such a large set of data, we create a separate table\n", " # that contains blocking keys and record ids\n", - " print('creating blocking_map database')\n", - " utils.athena_start_query(\"DROP TABLE IF EXISTS blocking_map\")\n", + " print('creating as_blocking_map database')\n", + " utils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\")\n", "\n", " q='''\n", - " CREATE EXTERNAL TABLE blocking_map \n", + " CREATE EXTERNAL TABLE as_blocking_map \n", " (block_key VARCHAR(200), donor_id INTEGER)\n", " ROW FORMAT DELIMITED\n", " FIELDS TERMINATED BY '\\t'\n", @@ -224,7 +234,7 @@ " 'classification'='csv', \n", " --'skip.header.line.count'='1', \n", " 
'serialization.null.format'='')\n", - " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') \n", + " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n", " utils.athena_start_query(q)" ] }, @@ -242,7 +252,7 @@ " # This never runs, index_fields is empty, possible bug?\n", " for field in deduper.fingerprinter.index_fields:\n", " q = '''\n", - " SELECT DISTINCT {field} FROM processed_donors \n", + " SELECT DISTINCT {field} FROM as_processed_donors \n", " WHERE {field} IS NOT NULL\n", " '''.format(field=field)\n", " cur_df = as_pandas(q)\n", @@ -274,7 +284,8 @@ "outputs": [], "source": [ " b_data = deduper.fingerprinter(full_data)\n", - " buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t') utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer) \n" + " buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n", + " utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv', Body=buffer) \n" ] }, { @@ -295,12 +306,12 @@ " ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n", " AS JSON))\n", " FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", - " from blocking_map as l\n", - " INNER JOIN blocking_map as r\n", + " from as_blocking_map as l\n", + " INNER JOIN as_blocking_map as r\n", " using (block_key)\n", " where l.donor_id < r.donor_id) ids\n", - " INNER JOIN processed_donors a on ids.east=a.donor_id\n", - " INNER JOIN processed_donors b on ids.west=b.donor_id\n", + " INNER JOIN as_processed_donors a on ids.east=a.donor_id\n", + " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", " '''\n", " read_cur_dict=as_pandas(q).itertuples(index=False, name=None)" ] @@ -324,11 +335,11 @@ "metadata": {}, "outputs": [], "source": [ - " utils.athena_start_query(\"DROP TABLE IF EXISTS entity_map\")\n", + " 
utils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\")\n", "\n", - " print('creating entity_map database')\n", + " print('creating as_entity_map database')\n", " q='''\n", - " CREATE EXTERNAL TABLE entity_map \n", + " CREATE EXTERNAL TABLE as_entity_map \n", " (donor_id INTEGER, canon_id INTEGER, \n", " cluster_score FLOAT)\n", " ROW FORMAT DELIMITED\n", @@ -340,11 +351,11 @@ " 'classification'='csv', \n", " --'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') \n", + " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n", " utils.athena_start_query(q) \n", "\n", " buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n", - " utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer) \n" + " utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_entity_map/as_entity_map.csv', Body=buffer) \n" ] }, { @@ -365,27 +376,28 @@ "\n", " locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers\n", " \n", - " utils.athena_start_query(\"DROP TABLE IF EXISTS e_map\")\n", + " utils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\")\n", " q = '''\n", - " CREATE TABLE e_map as \n", - " SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id \n", - " FROM entity_map \n", - " RIGHT JOIN donors USING(donor_id)\n", + " CREATE TABLE as_e_map as \n", + " SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n", + " FROM as_entity_map \n", + " RIGHT JOIN as_donors USING(donor_id)\n", + " \n", " '''\n", " \n", " utils.athena_start_query(q)\n", " q ='''\n", - " SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", + " SELECT array_join(filter(array[as_donors.first_name, 
as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", " donation_totals.totals AS totals \n", - " FROM donors INNER JOIN \n", + " FROM as_donors INNER JOIN \n", " (SELECT canon_id, SUM(cast (amount as double)) AS totals \n", - " FROM contributions INNER JOIN e_map \n", + " FROM as_contributions INNER JOIN as_e_map \n", " USING (donor_id) \n", " GROUP BY (canon_id) \n", " ORDER BY totals \n", " DESC LIMIT 10) \n", " AS donation_totals \n", - " ON donors.donor_id = donation_totals.canon_id\n", + " ON as_donors.donor_id = donation_totals.canon_id\n", " ORDER BY totals DESC\n", " '''\n", " cur_dict = as_pandas(q).to_dict('records')\n", @@ -401,10 +413,10 @@ " q = '''\n", " with donorscontributions as(\n", "\n", - " SELECT donors.donor_id, \n", - " array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", - " cast(contributions.amount as double) as amount\n", - " FROM donors INNER JOIN contributions \n", + " SELECT as_donors.donor_id, \n", + " array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", + " cast(as_contributions.amount as double) as amount\n", + " FROM as_donors INNER JOIN as_contributions \n", " USING (donor_id) \n", " )\n", " SELECT name, sum(amount) AS totals \n", @@ -430,20 +442,11 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NbConvertApp] Converting notebook athena_example.ipynb to script\n", - "[NbConvertApp] Writing 11731 bytes to ../athena_example/athena_example.py\n" - ] - } - ], + "outputs": [], "source": [ - "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/" + "# !jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/" ] }, { diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb index 47e75969..19e6f600 100644 --- 
a/notebooks/athena_init_db.ipynb +++ b/notebooks/athena_init_db.ipynb @@ -2,30 +2,22 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting ../athena_example/config.py\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile ../athena_example/config.py\n", "LOG_FILE = 'log.txt'\n", "# Connection parameters\n", "ACCESS_KEY_ID = None\n", "SECRET_ACCESS_KEY = None\n", - "ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n", - "WORKGROUP = 'RIA'\n", + "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n", + "WORKGROUP = 'RDS'\n", "REGION = 'eu-west-1'\n", - "DATABASE = 'ria_data_science_s3'\n", + "DATABASE = 'ria_tmp'\n", "\n", "# Database Parameters\n", - "DATABASE_BUCKET = 'com.ria.scratch'\n", + "DATABASE_BUCKET = 'ria-temp'\n", "DATABASE_ROOT_KEY = 'as-dedupe/'" ] }, @@ -38,7 +30,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Writing ../athena_example/athena_init.py\n" + "Overwriting ../athena_example/athena_init.py\n" ] } ], @@ -54,7 +46,7 @@ "[athena_example.py](athena_example.py).\n", " \n", "Tables created:\n", - "* raw_table - raw import of entire CSV file\n", + "* as_raw_table - raw import of entire CSV file\n", "* donors - all distinct donors based on name and address\n", "* recipients - all distinct campaign contribution recipients\n", "* contributions - contribution amounts tied to donor and recipients tables\n", @@ -97,15 +89,15 @@ "\n", "\n", "print('importing raw data from csv...')\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS raw_table\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS donors\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS recipients\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS contributions\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS processed_donors\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS 
as_raw_table\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\")\n", + "utils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\")\n", "\n", "\n", "q=r'''\n", - "CREATE EXTERNAL TABLE raw_table \n", + "CREATE EXTERNAL TABLE as_raw_table \n", " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n", " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n", @@ -129,7 +121,7 @@ " 'classification'='csv', \n", " 'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') \n", + "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n", "utils.athena_start_query(q)\n", "\n", "\n", @@ -161,14 +153,14 @@ "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", "\n", "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n", - " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))\n", + " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))\n", "\n", "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n", "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n", "# So we need to manually take care of that\n", "print('creating donors table...')\n", "q='''\n", - "CREATE TABLE donors as\n", + "CREATE TABLE as_donors as\n", " with tmp as\n", " (SELECT DISTINCT \n", " NULLIF(TRIM(last_name), '') as last_name, \n", 
@@ -180,14 +172,14 @@ " NULLIF(TRIM(zip), '') as zip, \n", " NULLIF(TRIM(employer), '') as employer, \n", " NULLIF(TRIM(occupation), '') as occupation\n", - " FROM raw_table)\n", + " FROM as_raw_table)\n", " SELECT row_number() over () as donor_id, * from tmp'''\n", "utils.athena_start_query(q)\n", "\n", "\n", "q='''\n", - "CREATE TABLE recipients as\n", - " SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table\n", + "CREATE TABLE as_recipients as\n", + " SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n", "'''\n", "utils.athena_start_query(q)\n", "\n", @@ -210,7 +202,7 @@ "# --\n", "\n", "q='''\n", - "CREATE TABLE contributions as\n", + "CREATE TABLE as_contributions as\n", " SELECT reciept_id as contribution_id, \n", " donors.donor_id as donor_id , \n", " committee_id as recipient_id, \n", @@ -222,21 +214,21 @@ " election_type, election_year, \n", " date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n", " date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n", - " FROM raw_table JOIN donors ON \n", - " coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND \n", - " coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND \n", - " coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND \n", - " coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND \n", - " coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND \n", - " coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND \n", - " coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND \n", - " coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND \n", - " coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')'''\n", + " FROM as_raw_table JOIN as_donors donors ON \n", + " coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND 
\n", + " coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n", + " coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n", + " coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n", + " coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n", + " coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n", + " coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n", + " coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n", + " coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')'''\n", "\n", "utils.athena_start_query(q)\n", "\n", "q = '''\n", - "CREATE TABLE processed_donors AS \n", + "CREATE TABLE as_processed_donors AS \n", " SELECT donor_id, \n", " LOWER(city) AS city, \n", " CASE WHEN (first_name IS NULL AND last_name IS NULL) \n", @@ -252,7 +244,7 @@ " LOWER(occupation) AS occupation, \n", " LOWER(employer) AS employer, \n", " first_name is null AS person \n", - " FROM donors'''\n", + " FROM as_donors'''\n", "utils.athena_start_query(q)\n", "\n", "\n", @@ -265,23 +257,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "importing raw data from csv...\n", - "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n", - "creating donors table...\n", - "creating contributions table\n", - "done\n" - ] - } - ], + "outputs": [], "source": [ "!python ../athena_example/athena_init.py" ] From d5f807a9e053c46d8564a9fe35cf3f2e25dda5da Mon 
Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 30 Oct 2020 20:48:10 +0000 Subject: [PATCH 09/19] probably working --- athena_example/athena_init.py | 44 +- athena_example/{utils.py => athenautils.py} | 70 +- athena_example/config.py | 3 +- notebooks/athena_example.ipynb | 1025 ++++++++++++++++--- notebooks/athena_init_db.ipynb | 81 +- 5 files changed, 1031 insertions(+), 192 deletions(-) rename athena_example/{utils.py => athenautils.py} (61%) diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py index f8bac6e0..c8b5b3ea 100644 --- a/athena_example/athena_init.py +++ b/athena_example/athena_init.py @@ -25,7 +25,7 @@ import csv import sys sys.path.insert(0, '../athena_example/') -import utils +import athenautils contributions_zip_file = 'Illinois-campaign-contributions.txt.zip' @@ -51,14 +51,14 @@ print('importing raw data from csv...') -utils.athena_start_query("DROP TABLE IF EXISTS as_raw_table") -utils.athena_start_query("DROP TABLE IF EXISTS as_donors") -utils.athena_start_query("DROP TABLE IF EXISTS as_recipients") -utils.athena_start_query("DROP TABLE IF EXISTS as_contributions") -utils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors") +athenautils.athena_start_query("DROP TABLE IF EXISTS as_raw_table", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS as_donors", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS as_recipients", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS as_contributions", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors", database=config.DATABASE) -q=r''' +q=r""" CREATE EXTERNAL TABLE as_raw_table (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), @@ -83,8 +83,8 @@ 'classification'='csv', 'skip.header.line.count'='1', 'serialization.null.format'='') 
-'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') -utils.athena_start_query(q) +""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') +athenautils.athena_start_query(q, database=config.DATABASE) df = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE, @@ -114,14 +114,14 @@ # df = df.replace(r'^\s*$', np.nan, regex=True) df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand') -utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None), +athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None), filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,)) # Athena is doesn't equate empty string and null, eventhough in the table spec we said so # Not that it's a bug, it works if the string is null in the source, but not after applying trim to it # So we need to manually take care of that print('creating donors table...') -q=''' +q=""" CREATE TABLE as_donors as with tmp as (SELECT DISTINCT @@ -135,15 +135,15 @@ NULLIF(TRIM(employer), '') as employer, NULLIF(TRIM(occupation), '') as occupation FROM as_raw_table) - SELECT row_number() over () as donor_id, * from tmp''' -utils.athena_start_query(q) + SELECT row_number() over () as donor_id, * from tmp""" +athenautils.athena_start_query(q, database=config.DATABASE) -q=''' +q=""" CREATE TABLE as_recipients as SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table -''' -utils.athena_start_query(q) +""" +athenautils.athena_start_query(q, database=config.DATABASE) print('creating contributions table') @@ -163,7 +163,7 @@ # "CHARACTER SET utf8 COLLATE utf8_unicode_ci") # -- -q=''' +q=""" CREATE TABLE as_contributions as SELECT reciept_id as contribution_id, 
donors.donor_id as donor_id , @@ -185,11 +185,11 @@ coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND - coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')''' + coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')""" -utils.athena_start_query(q) +athenautils.athena_start_query(q, database=config.DATABASE) -q = ''' +q = """ CREATE TABLE as_processed_donors AS SELECT donor_id, LOWER(city) AS city, @@ -206,8 +206,8 @@ LOWER(occupation) AS occupation, LOWER(employer) AS employer, first_name is null AS person - FROM as_donors''' -utils.athena_start_query(q) + FROM as_donors""" +athenautils.athena_start_query(q, database=config.DATABASE) diff --git a/athena_example/utils.py b/athena_example/athenautils.py similarity index 61% rename from athena_example/utils.py rename to athena_example/athenautils.py index 1b8b935a..3cd8e4dd 100644 --- a/athena_example/utils.py +++ b/athena_example/athenautils.py @@ -27,13 +27,35 @@ athena = boto3.client('athena', region_name=config.REGION, aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) -def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs): - query_execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True) +def cursor_execute(query, database=None, cursortype='dict', buffersize=config.BUFFERSIZE, + output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, + **kwargs): + + kwargs['chunksize']=buffersize + df_cur = athena_to_panda(query, database=database, + output_location=output_location, region=region, workgroup=workgroup, + **kwargs) + for df in df_cur: + if cursortype == 'dict': + all_rows = 
df.where(pd.notnull(df), None).to_dict('records') + if cursortype == 'tuple': + all_rows = df.where(pd.notnull(df), None).itertuples(index=False, name=None) + for row in all_rows: + yield row + + +def athena_to_panda(query, database=None, + output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, + **kwargs): + query_execution_id = athena_start_query(query, database=database, + output_location=output_location, region=region, workgroup=workgroup, + wait_until_finished=True) df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs) return df - -def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True): +def athena_start_query(query, database=None, + output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, + wait_until_finished=True): query_execution_id = athena.start_query_execution( QueryString=query, QueryExecutionContext={ @@ -88,13 +110,12 @@ def list_all(path): return listdir(path) -def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs): +def pandas_read_csv(filepath_or_buffer, **kwargs): bucket, key = seperate_bucket_key(filepath_or_buffer) obj = s3.get_object(Bucket=bucket, Key=key) return pd.read_csv(SomethingIO(obj['Body'].read()), **kwargs) -def read(filename, verbose=True): - log ("Reading {}".format(filename), verbose=verbose) +def read(filename): if is_s3_url(filename): bucket, key = seperate_bucket_key(filename) obj=s3.get_object(Bucket=bucket, Key=key) @@ -107,7 +128,24 @@ def write(body, filename): s3.put_object(Bucket=bucket, Key=key, Body=body) return - + +def file_name_append(filename, append, ommitext): + filename_base, ext = os.path.splitext(filename) + if ommitext: + return '%s%s' % (filename_base, append) + return '%s%s%s' % (filename_base, append, ext) + +def write_many(read_cursor, filename, 
buffersize=config.BUFFERSIZE): + chunkcount=0 + while True: + buffer_df = pd.DataFrame.from_records(read_cursor, nrows=buffersize) + if buffer_df.empty: + break + buffer = buffer_df.to_csv(index=False, header=False, sep='\t') + chunk_fname = file_name_append(filename, '_{}'.format(chunkcount), ommitext=False) + write(buffer, chunk_fname) + chunkcount += 1 + def file_exists(filename): bucket, key = seperate_bucket_key(filename) try: @@ -120,20 +158,4 @@ def file_exists(filename): raise else: return True - - -def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False): - if verbose == False: - return - if timestamped: - outstr = "[%s]\t%s\n" % (str(datetime.datetime.now()) , outstr) - else: - outstr = "%s\n" % (outstr,) - - with open(logfile_name, "a") as logfile: - logfile.write(outstr) - if not quiet: - sys.stdout.write(outstr); - sys.stdout.flush() -# Print iterations progress diff --git a/athena_example/config.py b/athena_example/config.py index 3715b750..f8e4a24b 100644 --- a/athena_example/config.py +++ b/athena_example/config.py @@ -9,4 +9,5 @@ # Database Parameters DATABASE_BUCKET = 'ria-temp' -DATABASE_ROOT_KEY = 'as-dedupe/' +DATABASE_ROOT_KEY = 'as_dedupe/' +BUFFERSIZE = 100000 diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb index da896697..e3a0e7b7 100644 --- a/notebooks/athena_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -1,23 +1,64 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Note: \n", + "Looks good, but check the sanity check notebook to makesure everything is correct" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: dedupe in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (2.0.6)\n", + "Requirement already satisfied: categorical-distance>=1.9 in 
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.9)\n", + "Requirement already satisfied: dedupe-variable-datetime in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1.5)\n", + "Requirement already satisfied: affinegap>=1.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.11)\n", + "Requirement already satisfied: highered>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.2.1)\n", + "Requirement already satisfied: typing-extensions in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (3.7.4.3)\n", + "Requirement already satisfied: simplecosine>=1.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.2)\n", + "Requirement already satisfied: doublemetaphone in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1)\n", + "Requirement already satisfied: fastcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.1.26)\n", + "Requirement already satisfied: rlr>=2.4.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.4.5)\n", + "Requirement already satisfied: haversine>=0.4.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.3.0)\n", + "Requirement already satisfied: BTrees>=4.1.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (4.7.2)\n", + "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.18.1)\n", + "Requirement already satisfied: zope.index in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (5.0.0)\n", + "Requirement already satisfied: dedupe-hcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.3.8)\n", + "Requirement already 
satisfied: Levenshtein-search in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.4.5)\n", + "Requirement already satisfied: future in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.18.2)\n", + "Requirement already satisfied: datetime-distance in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.1.3)\n", + "Requirement already satisfied: pyhacrf-datamade>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from highered>=0.2.0->dedupe) (0.2.5)\n", + "Requirement already satisfied: pylbfgs in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from rlr>=2.4.3->dedupe) (0.2.0.13)\n", + "Requirement already satisfied: zope.interface in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (5.1.2)\n", + "Requirement already satisfied: persistent>=4.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (4.6.4)\n", + "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.14.0)\n", + "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (45.2.0.post20200210)\n", + "Requirement already satisfied: python-dateutil>=2.6.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from datetime-distance->dedupe-variable-datetime->dedupe) (2.8.1)\n", + "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.14.0)\n", + "Requirement already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == 
\"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.19)\n", + "\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.2.4 is available.\n", + "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], "source": [ - "# !pip install dedupe" + "!pip install dedupe" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# %load ../mysql_example/mysql_example.py\n", - "#!/usr/bin/python\n", - "\n", "\"\"\"\n", "This is an example of working with very large data. There are about\n", "700,000 unduplicated donors in this database of Illinois political\n", @@ -25,7 +66,7 @@ "\n", "With such a large set of input data, we cannot store all the comparisons\n", "we need to make in memory. Instead, we will read the pairs on demand\n", - "from the MySQL database.\n", + "from the Athena database.\n", "\n", "__Note:__ You will need to run `python mysql_init_db.py`\n", "before running this script. 
See the annotates source for\n", @@ -58,10 +99,10 @@ "sys.path.insert(0, '../athena_example/')\n", "import config\n", "sys.path.insert(0, '../athena_example/')\n", - "import utils\n", + "import athenautils\n", "\n", "def as_pandas(query, **kwrgs):\n", - " df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n", + " df = athenautils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n", " return df.where(pd.notnull(df), None)\n", "\n", "def record_pairs(result_set):\n", @@ -119,17 +160,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading from mysql_example_settings\n" + ] + } + ], "source": [ " # We'll be using variations on this following select statement to pull\n", " # in campaign donor info.\n", " #\n", " # We did a fair amount of preprocessing of the fields in\n", " # `mysql_init_db.py` \n", - " DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n", - " \"from as_processed_donors\"\n", + " DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n", + " from as_processed_donors\"\"\"\n", "\n", " # ## Training\n", "\n", @@ -155,13 +204,8 @@ " deduper = dedupe.Dedupe(fields, num_cores=4)\n", "\n", " # We will sample pairs from the entire donor table for training\n", - "# with read_con.cursor() as cur:\n", - "\n", - " # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! 
\n", - " # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str\n", - "# with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n", - " temp_df = as_pandas(DONOR_SELECT)\n", - " temp_d = temp_df.to_dict('index')\n", + " cur = cur_execute(DONOR_SELECT)\n", + " temp_d = {i: row for i, row in enumerate(cur)}\n", " \n", "\n", " # If we have training data saved from a previous run of dedupe,\n", @@ -209,9 +253,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "blocking...\n", + "creating as_blocking_map database\n" + ] + }, + { + "data": { + "text/plain": [ + "'5651b314-d20b-4404-aa8d-30df70804e0e'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ " # ## Blocking\n", "\n", @@ -220,9 +283,9 @@ " # To run blocking on such a large set of data, we create a separate table\n", " # that contains blocking keys and record ids\n", " print('creating as_blocking_map database')\n", - " utils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\")\n", + " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\", database=config.DATABASE)\n", "\n", - " q='''\n", + " q=\"\"\"\n", " CREATE EXTERNAL TABLE as_blocking_map \n", " (block_key VARCHAR(200), donor_id INTEGER)\n", " ROW FORMAT DELIMITED\n", @@ -234,15 +297,23 @@ " 'classification'='csv', \n", " --'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n", - " utils.athena_start_query(q)" + " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n", + " athenautils.athena_start_query(q, database=config.DATABASE)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "creating inverted index\n" + ] + } + ], "source": [ " # If dedupe learned a Index Predicate, we have to take a pass\n", " # through the data and create indices.\n", @@ -251,94 +322,785 @@ " # Armin: \n", " # This never runs, index_fields is empty, possible bug?\n", " for field in deduper.fingerprinter.index_fields:\n", - " q = '''\n", - " SELECT DISTINCT {field} FROM as_processed_donors \n", + " q = \"\"\"\n", + " SELECT DISTINCT {field} FROM as_processed_donors\n", " WHERE {field} IS NOT NULL\n", - " '''.format(field=field)\n", - " cur_df = as_pandas(q)\n", - " # Do I need to cast it as a list?\n", - " field_data = cur_df[field]\n", + " \"\"\".format(field=field)\n", + " cur = cur_execute(q)\n", + " field_data = (row[field] for row in cur)\n", " deduper.fingerprinter.index(field_data, field)\n", " " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "writing blocking map\n" + ] + } + ], "source": [ " # Now we are ready to write our blocking map table by creating a\n", " # generator that yields unique `(block_key, donor_id)` tuples.\n", " print('writing blocking map')\n", " \n", - "\n", - " read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records')\n", - " full_data = ((row['donor_id'], row) for row in read_cur_dict)" + " read_cur = athenautils.cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", + " full_data = ((row['donor_id'], row) for row in read_cur)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_0.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_1.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_2.csv\n", + 
"s3://ria-temp/as_dedupe/as_blocking_map/blocking_3.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_4.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_5.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_6.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_7.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_8.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_9.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_10.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_11.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_12.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_13.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_14.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_15.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_16.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_17.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_18.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_19.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_20.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_21.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_22.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_23.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_24.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_25.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_26.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_27.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_28.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_29.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_30.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_31.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_32.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_33.csv\n", + 
"s3://ria-temp/as_dedupe/as_blocking_map/blocking_34.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_35.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_36.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_37.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_38.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_39.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_40.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_41.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_42.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_43.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_44.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_45.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_46.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_47.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_48.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_49.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_50.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_51.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_52.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_53.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_54.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_55.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_56.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_57.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_58.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_59.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_60.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_61.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_62.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_63.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_64.csv\n", + 
"s3://ria-temp/as_dedupe/as_blocking_map/blocking_65.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_66.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_67.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_68.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_69.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_70.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_71.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_72.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_73.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_74.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_75.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_76.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_77.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_78.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_79.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_80.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_81.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_82.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_83.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_84.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_85.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_86.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_87.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_88.csv\n", + "s3://ria-temp/as_dedupe/as_blocking_map/blocking_89.csv\n" + ] + } + ], "source": [ " b_data = deduper.fingerprinter(full_data)\n", - " buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n", - " utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv', Body=buffer) \n" + " athenautils.write_many(b_data, \n", + " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, 
config.DATABASE_ROOT_KEY, 'as_blocking_map/blocking.csv'))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "\n", " # select unique pairs to compare\n", - " q='''\n", - " SELECT a.donor_id,\n", - " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n", - " ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n", - " AS JSON)),\n", - " b.donor_id,\n", - " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n", - " ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n", - " AS JSON))\n", - " FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", - " from as_blocking_map as l\n", - " INNER JOIN as_blocking_map as r\n", - " using (block_key)\n", - " where l.donor_id < r.donor_id) ids\n", - " INNER JOIN as_processed_donors a on ids.east=a.donor_id\n", - " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", - " '''\n", - " read_cur_dict=as_pandas(q).itertuples(index=False, name=None)" + " q=\"\"\"\n", + " SELECT a.donor_id,\n", + " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n", + " ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n", + " AS JSON)),\n", + " b.donor_id,\n", + " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n", + " ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n", + " AS JSON))\n", + " FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", + " from as_blocking_map as l\n", + " INNER JOIN as_blocking_map as r\n", + " using (block_key)\n", + " where l.donor_id < r.donor_id) ids\n", + " INNER JOIN as_processed_donors a on ids.east=a.donor_id\n", + " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", + " \"\"\"\n", + " read_cur = athenautils.cursor_execute(q, cursortype='tuple', database=config.DATABASE)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "clustering...\n", + "0\n", + "10000\n", + "20000\n", + "30000\n", + "40000\n", + "50000\n", + "60000\n", + "70000\n", + "80000\n", + "90000\n", + "100000\n", + "110000\n", + "120000\n", + "130000\n", + "140000\n", + "150000\n", + "160000\n", + "170000\n", + "180000\n", + "190000\n", + "200000\n", + "210000\n", + "220000\n", + "230000\n", + "240000\n", + "250000\n", + "260000\n", + "270000\n", + "280000\n", + "290000\n", + "300000\n", + "310000\n", + "320000\n", + "330000\n", + "340000\n", + "350000\n", + "360000\n", + "370000\n", + "380000\n", + "390000\n", + "400000\n", + "410000\n", + "420000\n", + "430000\n", + "440000\n", + "450000\n", + "460000\n", + "470000\n", + "480000\n", + "490000\n", + "500000\n", + "510000\n", + "520000\n", + "530000\n", + "540000\n", + "550000\n", + "560000\n", + "570000\n", + "580000\n", + "590000\n", + "600000\n", + "610000\n", + "620000\n", + "630000\n", + "640000\n", + "650000\n", + "660000\n", + "670000\n", + "680000\n", + "690000\n", + "700000\n", + "710000\n", + "720000\n", + "730000\n", + "740000\n", + "750000\n", + "760000\n", + "770000\n", + "780000\n", + "790000\n", + "800000\n", + "810000\n", + "820000\n", + "830000\n", + "840000\n", + "850000\n", + "860000\n", + "870000\n", + "880000\n", + "890000\n", + "900000\n", + "910000\n", + "920000\n", + "930000\n", + "940000\n", + "950000\n", + "960000\n", + "970000\n", + "980000\n", + "990000\n", + "1000000\n", + "1010000\n", + "1020000\n", + "1030000\n", + "1040000\n", + "1050000\n", + "1060000\n", + "1070000\n", + "1080000\n", + "1090000\n", + "1100000\n", + "1110000\n", + "1120000\n", + "1130000\n", + "1140000\n", + "1150000\n", + "1160000\n", + "1170000\n", + "1180000\n", + "1190000\n", + "1200000\n", + "1210000\n", + "1220000\n", + "1230000\n", + "1240000\n", + "1250000\n", + "1260000\n", + "1270000\n", + "1280000\n", + "1290000\n", + "1300000\n", + "1310000\n", + "1320000\n", + "1330000\n", + "1340000\n", + "1350000\n", + 
"1360000\n", + "1370000\n", + "1380000\n", + "1390000\n", + "1400000\n", + "1410000\n", + "1420000\n", + "1430000\n", + "1440000\n", + "1450000\n", + "1460000\n", + "1470000\n", + "1480000\n", + "1490000\n", + "1500000\n", + "1510000\n", + "1520000\n", + "1530000\n", + "1540000\n", + "1550000\n", + "1560000\n", + "1570000\n", + "1580000\n", + "1590000\n", + "1600000\n", + "1610000\n", + "1620000\n", + "1630000\n", + "1640000\n", + "1650000\n", + "1660000\n", + "1670000\n", + "1680000\n", + "1690000\n", + "1700000\n", + "1710000\n", + "1720000\n", + "1730000\n", + "1740000\n", + "1750000\n", + "1760000\n", + "1770000\n", + "1780000\n", + "1790000\n", + "1800000\n", + "1810000\n", + "1820000\n", + "1830000\n", + "1840000\n", + "1850000\n", + "1860000\n", + "1870000\n", + "1880000\n", + "1890000\n", + "1900000\n", + "1910000\n", + "1920000\n", + "1930000\n", + "1940000\n", + "1950000\n", + "1960000\n", + "1970000\n", + "1980000\n", + "1990000\n", + "2000000\n", + "2010000\n", + "2020000\n", + "2030000\n", + "2040000\n", + "2050000\n", + "2060000\n", + "2070000\n", + "2080000\n", + "2090000\n", + "2100000\n", + "2110000\n", + "2120000\n", + "2130000\n", + "2140000\n", + "2150000\n", + "2160000\n", + "2170000\n", + "2180000\n", + "2190000\n", + "2200000\n", + "2210000\n", + "2220000\n", + "2230000\n", + "2240000\n", + "2250000\n", + "2260000\n", + "2270000\n", + "2280000\n", + "2290000\n", + "2300000\n", + "2310000\n", + "2320000\n", + "2330000\n", + "2340000\n", + "2350000\n", + "2360000\n", + "2370000\n", + "2380000\n", + "2390000\n", + "2400000\n", + "2410000\n", + "2420000\n", + "2430000\n", + "2440000\n", + "2450000\n", + "2460000\n", + "2470000\n", + "2480000\n", + "2490000\n", + "2500000\n", + "2510000\n", + "2520000\n", + "2530000\n", + "2540000\n", + "2550000\n", + "2560000\n", + "2570000\n", + "2580000\n", + "2590000\n", + "2600000\n", + "2610000\n", + "2620000\n", + "2630000\n", + "2640000\n", + "2650000\n", + "2660000\n", + "2670000\n", + "2680000\n", + 
"2690000\n", + "2700000\n", + "2710000\n", + "2720000\n", + "2730000\n", + "2740000\n", + "2750000\n", + "2760000\n", + "2770000\n", + "2780000\n", + "2790000\n", + "2800000\n", + "2810000\n", + "2820000\n", + "2830000\n", + "2840000\n", + "2850000\n", + "2860000\n", + "2870000\n", + "2880000\n", + "2890000\n", + "2900000\n", + "2910000\n", + "2920000\n", + "2930000\n", + "2940000\n", + "2950000\n", + "2960000\n", + "2970000\n", + "2980000\n", + "2990000\n", + "3000000\n", + "3010000\n", + "3020000\n", + "3030000\n", + "3040000\n", + "3050000\n", + "3060000\n", + "3070000\n", + "3080000\n", + "3090000\n", + "3100000\n", + "3110000\n", + "3120000\n", + "3130000\n", + "3140000\n", + "3150000\n", + "3160000\n", + "3170000\n", + "3180000\n", + "3190000\n", + "3200000\n", + "3210000\n", + "3220000\n", + "3230000\n", + "3240000\n", + "3250000\n", + "3260000\n", + "3270000\n", + "3280000\n", + "3290000\n", + "3300000\n", + "3310000\n", + "3320000\n", + "3330000\n", + "3340000\n", + "3350000\n", + "3360000\n", + "3370000\n", + "3380000\n", + "3390000\n", + "3400000\n", + "3410000\n", + "3420000\n", + "3430000\n", + "3440000\n", + "3450000\n", + "3460000\n", + "3470000\n", + "3480000\n", + "3490000\n", + "3500000\n", + "3510000\n", + "3520000\n", + "3530000\n", + "3540000\n", + "3550000\n", + "3560000\n", + "3570000\n", + "3580000\n", + "3590000\n", + "3600000\n", + "3610000\n", + "3620000\n", + "3630000\n", + "3640000\n", + "3650000\n", + "3660000\n", + "3670000\n", + "3680000\n", + "3690000\n", + "3700000\n", + "3710000\n", + "3720000\n", + "3730000\n", + "3740000\n", + "3750000\n", + "3760000\n", + "3770000\n", + "3780000\n", + "3790000\n", + "3800000\n", + "3810000\n", + "3820000\n", + "3830000\n", + "3840000\n", + "3850000\n", + "3860000\n", + "3870000\n", + "3880000\n", + "3890000\n", + "3900000\n", + "3910000\n", + "3920000\n", + "3930000\n", + "3940000\n", + "3950000\n", + "3960000\n", + "3970000\n", + "3980000\n", + "3990000\n", + "4000000\n", + "4010000\n", + 
"4020000\n", + "4030000\n", + "4040000\n", + "4050000\n", + "4060000\n", + "4070000\n", + "4080000\n", + "4090000\n", + "4100000\n", + "4110000\n", + "4120000\n", + "4130000\n", + "4140000\n", + "4150000\n", + "4160000\n", + "4170000\n", + "4180000\n", + "4190000\n", + "4200000\n", + "4210000\n", + "4220000\n", + "4230000\n", + "4240000\n", + "4250000\n", + "4260000\n", + "4270000\n", + "4280000\n", + "4290000\n", + "4300000\n", + "4310000\n", + "4320000\n", + "4330000\n", + "4340000\n", + "4350000\n", + "4360000\n", + "4370000\n", + "4380000\n", + "4390000\n", + "4400000\n", + "4410000\n", + "4420000\n", + "4430000\n", + "4440000\n", + "4450000\n", + "4460000\n", + "4470000\n", + "4480000\n", + "4490000\n", + "4500000\n", + "4510000\n", + "4520000\n", + "4530000\n", + "4540000\n", + "4550000\n", + "4560000\n", + "4570000\n", + "4580000\n", + "4590000\n", + "4600000\n", + "4610000\n", + "4620000\n", + "4630000\n", + "4640000\n", + "4650000\n", + "4660000\n", + "4670000\n", + "4680000\n", + "4690000\n", + "4700000\n", + "4710000\n", + "4720000\n", + "4730000\n", + "4740000\n", + "4750000\n", + "4760000\n", + "4770000\n", + "4780000\n", + "4790000\n", + "4800000\n", + "4810000\n", + "4820000\n", + "4830000\n", + "4840000\n", + "4850000\n", + "4860000\n", + "4870000\n", + "4880000\n", + "4890000\n", + "4900000\n", + "4910000\n", + "4920000\n", + "4930000\n", + "4940000\n", + "4950000\n", + "4960000\n", + "4970000\n", + "4980000\n", + "4990000\n", + "5000000\n", + "5010000\n", + "5020000\n", + "5030000\n", + "5040000\n", + "5050000\n", + "5060000\n", + "5070000\n", + "5080000\n", + "5090000\n", + "5100000\n", + "5110000\n", + "5120000\n" + ] + } + ], "source": [ " # ## Clustering\n", "\n", " print('clustering...')\n", - " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n", + " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n", " threshold=0.5)" ] }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "creating as_entity_map database\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.048800940974382e-18\n", + "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.1038451096258602e-17\n", + "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.921472802362437e-17\n", + "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.3509749469939632e-16\n", + "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.753198598980096e-16\n", + "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0403512617910142e-15\n", + "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.894390795098561e-15\n", + "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 8.13221083415663e-15\n", + "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.2319167959587083e-14\n", + "WARNING:dedupe.clustering:A component contained 158375 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 6.068761194789993e-14\n", + "WARNING:dedupe.clustering:A component contained 158372 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.6520029686587212e-13\n", + "WARNING:dedupe.clustering:A component contained 158364 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.5125040046676256e-13\n", + "WARNING:dedupe.clustering:A component contained 158352 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.2269463311618112e-12\n", + "WARNING:dedupe.clustering:A component contained 158314 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.3356439660236965e-12\n", + "WARNING:dedupe.clustering:A component contained 157999 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.069763004960628e-12\n", + "WARNING:dedupe.clustering:A component contained 157528 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.4668471436592435e-11\n", + "WARNING:dedupe.clustering:A component contained 157002 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 6.705726454279488e-11\n", + "WARNING:dedupe.clustering:A component contained 156034 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.822899310600237e-10\n", + "WARNING:dedupe.clustering:A component contained 153167 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.955617200886392e-10\n", + "WARNING:dedupe.clustering:A component contained 150749 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.3472511446191333e-09\n", + "WARNING:dedupe.clustering:A component contained 148126 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 3.663451859737113e-09\n", + "WARNING:dedupe.clustering:A component contained 144445 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.961619074337575e-09\n", + "WARNING:dedupe.clustering:A component contained 140752 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.7079365851019727e-08\n", + "WARNING:dedupe.clustering:A component contained 136821 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 7.361173277021834e-08\n", + "WARNING:dedupe.clustering:A component contained 132985 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.00129481181664e-07\n", + "WARNING:dedupe.clustering:A component contained 129188 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 5.440113266301783e-07\n", + "WARNING:dedupe.clustering:A component contained 126461 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.4789049802767608e-06\n", + "WARNING:dedupe.clustering:A component contained 124279 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.020875427015852e-06\n", + "WARNING:dedupe.clustering:A component contained 121039 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0930919387732102e-05\n", + "WARNING:dedupe.clustering:A component contained 117376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.971327846301476e-05\n", + "WARNING:dedupe.clustering:A component contained 114455 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 8.076722851404745e-05\n", + "WARNING:dedupe.clustering:A component contained 109969 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 0.0002195200394847895\n", + "WARNING:dedupe.clustering:A component contained 106867 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.0005965000236037636\n", + "WARNING:dedupe.clustering:A component contained 101488 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.001619959237855584\n", + "WARNING:dedupe.clustering:A component contained 94945 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.004391296209574281\n", + "WARNING:dedupe.clustering:A component contained 89944 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.01184744490841891\n", + "WARNING:dedupe.clustering:A component contained 85759 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.031562819826922474\n", + "WARNING:dedupe.clustering:A component contained 79119 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.08138832068049236\n", + "WARNING:dedupe.clustering:A component contained 73185 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.1940994212134007\n", + "WARNING:dedupe.clustering:A component contained 67046 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.39565940016357204\n", + "WARNING:dedupe.clustering:A component contained 57601 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.6402437741869547\n", + "WARNING:dedupe.clustering:A component contained 36731 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 0.8286982262391892\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_0.csv\n", + "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_1.csv\n", + "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_2.csv\n", + "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_3.csv\n", + "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_4.csv\n" + ] + } + ], "source": [ - " utils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\")\n", + " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n", "\n", " print('creating as_entity_map database')\n", - " q='''\n", + " q=\"\"\"\n", " CREATE EXTERNAL TABLE as_entity_map \n", " (donor_id INTEGER, canon_id INTEGER, \n", " cluster_score FLOAT)\n", @@ -351,18 +1113,49 @@ " 'classification'='csv', \n", " --'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - " '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n", - " utils.athena_start_query(q) \n", + " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n", + " athenautils.athena_start_query(q, database=config.DATABASE) \n", "\n", - " buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n", - " utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_entity_map/as_entity_map.csv', Body=buffer) \n" + " athenautils.write_many(cluster_ids(clustered_dupes),\n", + " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY, 'as_entity_map/as_entity_map.csv'))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# duplicate sets\n", + "Top Donors (deduped)\n", + " $32,146,134.06: democratic party of 
illinois\n", + " $13,762,181.54: republican state senate campaign committee\n", + " $9,590,682.54: republican governors association\n", + " $9,040,913.46: madigan michael friends of\n", + " $7,949,218.49: seiu healthcare il in pac\n", + " $6,435,815.20: chicago teachers union, ift local 1\n", + " $6,353,463.90: illinois senate democratic fund (the)\n", + " $6,077,259.02: fred eychaner\n", + " $6,022,884.47: scott cohen\n", + " $5,911,667.89: illinois republican party\n", + "Top Donors (raw)\n", + " $14,319,194.47: democratic party of illinois\n", + " $13,020,132.76: democratic party of illinois\n", + " $9,027,432.54: republican governors association\n", + " $7,897,829.31: rga illinois 2010 pac\n", + " $6,675,000.00: madigan michael friends of\n", + " $6,008,841.69: scott cohen\n", + " $5,570,839.00: ronald gidwitz,\n", + " $5,562,800.00: citizens for emil jones\n", + " $5,324,649.63: paul wood,\n", + " $5,132,563.83: seiu healthcare il in\n", + "ran in 3723.114373922348 seconds\n" + ] + } + ], "source": [ " # Print out the number of duplicates found\n", " print('# duplicate sets')\n", @@ -376,60 +1169,58 @@ "\n", " locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers\n", " \n", - " utils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\")\n", - " q = '''\n", - " CREATE TABLE as_e_map as \n", + " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n", + " \n", + " q = \"\"\"\n", + " CREATE TABLE as_e_map as \n", " SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n", " FROM as_entity_map \n", - " RIGHT JOIN as_donors USING(donor_id)\n", - " \n", - " '''\n", + " RIGHT JOIN as_donors USING(donor_id) \n", + " \"\"\" \n", + " athenautils.athena_start_query(q, database=config.DATABASE)\n", " \n", - " utils.athena_start_query(q)\n", - " q ='''\n", - " SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", - " 
donation_totals.totals AS totals \n", - " FROM as_donors INNER JOIN \n", - " (SELECT canon_id, SUM(cast (amount as double)) AS totals \n", - " FROM as_contributions INNER JOIN as_e_map \n", - " USING (donor_id) \n", - " GROUP BY (canon_id) \n", - " ORDER BY totals \n", - " DESC LIMIT 10) \n", - " AS donation_totals \n", - " ON as_donors.donor_id = donation_totals.canon_id\n", - " ORDER BY totals DESC\n", - " '''\n", - " cur_dict = as_pandas(q).to_dict('records')\n", + " q = \"\"\"\n", + " SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", + " donation_totals.totals AS totals \n", + " FROM as_donors INNER JOIN \n", + " (SELECT canon_id, SUM(cast (amount as double)) AS totals \n", + " FROM as_contributions INNER JOIN as_e_map \n", + " USING (donor_id) \n", + " GROUP BY (canon_id) \n", + " ORDER BY totals \n", + " DESC LIMIT 10) \n", + " AS donation_totals \n", + " ON as_donors.donor_id = donation_totals.canon_id\n", + " ORDER BY totals DESC\n", + " \"\"\"\n", + " cur = athenautils.cursor_execute(q, database=config.DATABASE)\n", "\n", " print(\"Top Donors (deduped)\")\n", - " for row in cur_dict:\n", + " for row in cur:\n", " row['totals'] = locale.currency(row['totals'], grouping=True)\n", " print('%(totals)20s: %(name)s' % row)\n", "\n", " # Compare this to what we would have gotten if we hadn't done any\n", " # deduplication\n", + " q = \"\"\"\n", + " with donorscontributions as(\n", "\n", - " q = '''\n", - " with donorscontributions as(\n", - "\n", - " SELECT as_donors.donor_id, \n", - " array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", - " cast(as_contributions.amount as double) as amount\n", - " FROM as_donors INNER JOIN as_contributions \n", - " USING (donor_id) \n", - " )\n", - " SELECT name, sum(amount) AS totals \n", - " FROM donorscontributions\n", - " GROUP BY donor_id, name\n", - " ORDER BY totals DESC \n", - " LIMIT 10\n", - " '''\n", - 
"\n", - " cur_dict = as_pandas(q).to_dict('records')\n", + " SELECT as_donors.donor_id, \n", + " array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", + " cast(as_contributions.amount as double) as amount\n", + " FROM as_donors INNER JOIN as_contributions \n", + " USING (donor_id) \n", + " )\n", + " SELECT name, sum(amount) AS totals \n", + " FROM donorscontributions\n", + " GROUP BY donor_id, name\n", + " ORDER BY totals DESC \n", + " LIMIT 10\n", + " \"\"\"\n", + " cur = athenautils.cursor_execute(q, database=config.DATABASE)\n", "\n", " print(\"Top Donors (raw)\")\n", - " for row in cur_dict:\n", + " for row in cur:\n", " row['totals'] = locale.currency(row['totals'], grouping=True)\n", " print('%(totals)20s: %(name)s' % row)\n", "\n", diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb index 19e6f600..a059f520 100644 --- a/notebooks/athena_init_db.ipynb +++ b/notebooks/athena_init_db.ipynb @@ -2,9 +2,17 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting ../athena_example/config.py\n" + ] + } + ], "source": [ "%%writefile ../athena_example/config.py\n", "LOG_FILE = 'log.txt'\n", @@ -18,12 +26,13 @@ "\n", "# Database Parameters\n", "DATABASE_BUCKET = 'ria-temp'\n", - "DATABASE_ROOT_KEY = 'as-dedupe/'" + "DATABASE_ROOT_KEY = 'as_dedupe/'\n", + "BUFFERSIZE = 100000" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -63,7 +72,7 @@ "import csv\n", "import sys\n", "sys.path.insert(0, '../athena_example/')\n", - "import utils\n", + "import athenautils\n", "\n", "\n", "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", @@ -89,14 +98,14 @@ "\n", "\n", "print('importing raw data from csv...')\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS 
as_raw_table\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\")\n", - "utils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\")\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_raw_table\", database=config.DATABASE)\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n", "\n", "\n", - "q=r'''\n", + "q=r\"\"\"\n", "CREATE EXTERNAL TABLE as_raw_table \n", " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n", @@ -121,8 +130,8 @@ " 'classification'='csv', \n", " 'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", - "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n", - "utils.athena_start_query(q)\n", + "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", "\n", "\n", "df = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE, \n", @@ -152,14 +161,14 @@ "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n", "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", "\n", - "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n", + 
"athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n", " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))\n", "\n", "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n", "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n", "# So we need to manually take care of that\n", "print('creating donors table...')\n", - "q='''\n", + "q=\"\"\"\n", "CREATE TABLE as_donors as\n", " with tmp as\n", " (SELECT DISTINCT \n", @@ -173,15 +182,15 @@ " NULLIF(TRIM(employer), '') as employer, \n", " NULLIF(TRIM(occupation), '') as occupation\n", " FROM as_raw_table)\n", - " SELECT row_number() over () as donor_id, * from tmp'''\n", - "utils.athena_start_query(q)\n", + " SELECT row_number() over () as donor_id, * from tmp\"\"\"\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", "\n", "\n", - "q='''\n", + "q=\"\"\"\n", "CREATE TABLE as_recipients as\n", " SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n", - "'''\n", - "utils.athena_start_query(q)\n", + "\"\"\"\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", "\n", "print('creating contributions table')\n", "\n", @@ -201,7 +210,7 @@ "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", "# --\n", "\n", - "q='''\n", + "q=\"\"\"\n", "CREATE TABLE as_contributions as\n", " SELECT reciept_id as contribution_id, \n", " donors.donor_id as donor_id , \n", @@ -223,11 +232,11 @@ " coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n", " coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n", " coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n", - " coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')'''\n", + " coalesce(donors.zip, '') = 
coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n", "\n", - "utils.athena_start_query(q)\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", "\n", - "q = '''\n", + "q = \"\"\"\n", "CREATE TABLE as_processed_donors AS \n", " SELECT donor_id, \n", " LOWER(city) AS city, \n", @@ -244,8 +253,8 @@ " LOWER(occupation) AS occupation, \n", " LOWER(employer) AS employer, \n", " first_name is null AS person \n", - " FROM as_donors'''\n", - "utils.athena_start_query(q)\n", + " FROM as_donors\"\"\"\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", "\n", "\n", "\n", @@ -255,9 +264,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "importing raw data from csv...\n", + "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", + "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n", + "creating donors table...\n", + "creating contributions table\n", + "done\n" + ] + } + ], "source": [ "!python ../athena_example/athena_init.py" ] From fbfb323aaca448b68bca1fd16836c1177f8248b7 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 3 Nov 2020 21:02:51 +0000 Subject: [PATCH 10/19] checkpoint --- athena_example/README.md | 2 +- athena_example/athena_example.py | 247 ++++---- athena_example/athena_init.py | 87 ++- athena_example/athenautils.py | 28 +- athena_example/config.py | 1 + notebooks/athena_example.ipynb | 934 +++---------------------------- notebooks/athena_init_db.ipynb | 148 ++--- 7 files changed, 315 insertions(+), 1132 deletions(-) diff --git a/athena_example/README.md b/athena_example/README.md 
index 3530935d..53442a12 100644 --- a/athena_example/README.md +++ b/athena_example/README.md @@ -13,7 +13,7 @@ Once that's all done you can run the example: ```bash cd mysql_example -python athena_init_db.py +python athena_init.py python athena_example.py ``` diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py index a738c56c..384172f2 100644 --- a/athena_example/athena_example.py +++ b/athena_example/athena_example.py @@ -4,9 +4,6 @@ # In[ ]: -# %load ../mysql_example/mysql_example.py -#!/usr/bin/python - """ This is an example of working with very large data. There are about 700,000 unduplicated donors in this database of Illinois political @@ -14,7 +11,7 @@ With such a large set of input data, we cannot store all the comparisons we need to make in memory. Instead, we will read the pairs on demand -from the MySQL database. +from the Athena database. __Note:__ You will need to run `python mysql_init_db.py` before running this script. See the annotates source for @@ -47,12 +44,23 @@ sys.path.insert(0, '../athena_example/') import config sys.path.insert(0, '../athena_example/') -import utils +import athenautils -def as_pandas(query, **kwrgs): - df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs) - return df.where(pd.notnull(df), None) +def cursor_execute(query, database): + ''' + The MySQL compatible Cursor + ''' + return athenautils.cursor_execute(query, database=database, + cursortype='tuple', buffersize=config.BUFFERSIZE, + escapechar=None, keep_default_na=False, na_values=['']) +def dict_cursor_execute(query, database): + ''' + The MySQL compatible DicCursor + ''' + return athenautils.cursor_execute(query, database=database, + cursortype='dict', buffersize=config.BUFFERSIZE, + escapechar=None, keep_default_na=False, na_values=['']) def record_pairs(result_set): for i, row in enumerate(result_set): a_record_id, a_record, b_record_id, b_record = row @@ -75,7 +83,7 @@ def 
cluster_ids(clustered_dupes): if __name__ == '__main__': - # ## Logging + ## Logging # Dedupe uses Python logging to show or suppress verbose output. Added # for convenience. To enable verbose output, run `python @@ -113,7 +121,8 @@ def cluster_ids(clustered_dupes): # # We did a fair amount of preprocessing of the fields in # `mysql_init_db.py` -DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address " "from processed_donors" +DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address + from as_processed_donors""" # ## Training @@ -139,13 +148,8 @@ def cluster_ids(clustered_dupes): deduper = dedupe.Dedupe(fields, num_cores=4) # We will sample pairs from the entire donor table for training -# with read_con.cursor() as cur: - - # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! - # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str -# with conn.cursor(PandasCursor, schema_name=schema_name) as cursor: - temp_df = as_pandas(DONOR_SELECT) - temp_d = temp_df.to_dict('index') + cur = cur_execute(DONOR_SELECT) + temp_d = {i: row for i, row in enumerate(cur)} # If we have training data saved from a previous run of dedupe, @@ -200,11 +204,13 @@ def cluster_ids(clustered_dupes): # To run blocking on such a large set of data, we create a separate table # that contains blocking keys and record ids -print('creating blocking_map database') -utils.athena_start_query("DROP TABLE IF EXISTS blocking_map") +print('creating as_blocking_map database') +athenautils.drop_external_table("as_blocking_map", + location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'), + database=config.DATABASE) -q=''' -CREATE EXTERNAL TABLE blocking_map +q=""" +CREATE EXTERNAL TABLE as_blocking_map (block_key VARCHAR(200), donor_id INTEGER) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' @@ -215,8 +221,8 @@ def cluster_ids(clustered_dupes): 'classification'='csv', 
--'skip.header.line.count'='1', 'serialization.null.format'='') -'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') -utils.athena_start_query(q) +""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') +athenautils.athena_start_query(q, database=config.DATABASE) # In[ ]: @@ -229,13 +235,12 @@ def cluster_ids(clustered_dupes): # Armin: # This never runs, index_fields is empty, possible bug? for field in deduper.fingerprinter.index_fields: - q = ''' - SELECT DISTINCT {field} FROM processed_donors + q = """ + SELECT DISTINCT {field} FROM as_processed_donors WHERE {field} IS NOT NULL - '''.format(field=field) - cur_df = as_pandas(q) - # Do I need to cast it as a list? - field_data = cur_df[field] + """.format(field=field) + cur = cur_execute(q) + field_data = (row[field] for row in cur) deduper.fingerprinter.index(field_data, field) @@ -247,16 +252,16 @@ def cluster_ids(clustered_dupes): # generator that yields unique `(block_key, donor_id)` tuples. 
print('writing blocking map') - -read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records') -full_data = ((row['donor_id'], row) for row in read_cur_dict) +read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE) +full_data = ((row['donor_id'], row) for row in read_cur) # In[ ]: b_data = deduper.fingerprinter(full_data) -buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\t') utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer) +athenautils.write_many(b_data, + filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv')) # In[ ]: @@ -264,24 +269,24 @@ def cluster_ids(clustered_dupes): # select unique pairs to compare - q=''' - SELECT a.donor_id, - json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], - ARRAY[ a.city, a.name, a.zip, a.state, a.address]) - AS JSON)), - b.donor_id, - json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], - ARRAY[ b.city, b.name, b.zip, b.state, b.address]) - AS JSON)) - FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west - from blocking_map as l - INNER JOIN blocking_map as r - using (block_key) - where l.donor_id < r.donor_id) ids - INNER JOIN processed_donors a on ids.east=a.donor_id - INNER JOIN processed_donors b on ids.west=b.donor_id - ''' - read_cur_dict=as_pandas(q).itertuples(index=False, name=None) + q=""" + SELECT a.donor_id, + json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], + ARRAY[ a.city, a.name, a.zip, a.state, a.address]) + AS JSON)), + b.donor_id, + json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], + ARRAY[ b.city, b.name, b.zip, b.state, b.address]) + AS JSON)) + FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west + from as_blocking_map as l + INNER JOIN as_blocking_map as r + using (block_key) + where l.donor_id < r.donor_id) ids + INNER JOIN 
as_processed_donors a on ids.east=a.donor_id + INNER JOIN as_processed_donors b on ids.west=b.donor_id + """ + read_cur = cursor_execute(q, database=config.DATABASE) # In[ ]: @@ -290,34 +295,37 @@ def cluster_ids(clustered_dupes): # ## Clustering print('clustering...') -clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)), +clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)), threshold=0.5) # In[ ]: -utils.athena_start_query("DROP TABLE IF EXISTS entity_map") - -print('creating entity_map database') -q=''' -CREATE EXTERNAL TABLE entity_map - (donor_id INTEGER, canon_id INTEGER, - cluster_score FLOAT) -ROW FORMAT DELIMITED - FIELDS TERMINATED BY '\t' - LINES TERMINATED BY '\n' -LOCATION - 's3://{}/{}' -TBLPROPERTIES ( - 'classification'='csv', - --'skip.header.line.count'='1', - 'serialization.null.format'='') -'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') -utils.athena_start_query(q) - -buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\t') -utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer) +# athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE) + athenautils.drop_external_table("as_entity_map", + location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), + database=config.DATABASE) + + print('creating as_entity_map database') + q=""" + CREATE EXTERNAL TABLE as_entity_map + (donor_id INTEGER, canon_id INTEGER, + cluster_score FLOAT) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + LINES TERMINATED BY '\n' + LOCATION + 's3://{}/{}' + TBLPROPERTIES ( + 'classification'='csv', + --'skip.header.line.count'='1', + 'serialization.null.format'='') + """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') + athenautils.athena_start_query(q, database=config.DATABASE) + + 
athenautils.write_many(cluster_ids(clustered_dupes), + filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv')) # In[ ]: @@ -335,70 +343,65 @@ def cluster_ids(clustered_dupes): locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers -utils.athena_start_query("DROP TABLE IF EXISTS e_map") -q = ''' -CREATE TABLE e_map as - SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id - FROM entity_map - RIGHT JOIN donors USING(donor_id) -''' - -utils.athena_start_query(q) -q =''' -SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, - donation_totals.totals AS totals -FROM donors INNER JOIN - (SELECT canon_id, SUM(cast (amount as double)) AS totals - FROM contributions INNER JOIN e_map - USING (donor_id) - GROUP BY (canon_id) - ORDER BY totals - DESC LIMIT 10) - AS donation_totals -ON donors.donor_id = donation_totals.canon_id -ORDER BY totals DESC -''' -cur_dict = as_pandas(q).to_dict('records') +athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE) + +q = """ + CREATE TABLE as_e_map as + SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id + FROM as_entity_map + RIGHT JOIN as_donors USING(donor_id) + """ +athenautils.athena_start_query(q, database=config.DATABASE) + +q = """ + SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, + donation_totals.totals AS totals + FROM as_donors INNER JOIN + (SELECT canon_id, SUM(cast (amount as double)) AS totals + FROM as_contributions INNER JOIN as_e_map + USING (donor_id) + GROUP BY (canon_id) + ORDER BY totals + DESC LIMIT 10) + AS donation_totals + ON as_donors.donor_id = donation_totals.canon_id + ORDER BY totals DESC +""" +cur = dict_cursor_execute(q, database=config.DATABASE) print("Top Donors (deduped)") -for row in cur_dict: +for row in cur: row['totals'] = 
locale.currency(row['totals'], grouping=True) print('%(totals)20s: %(name)s' % row) # Compare this to what we would have gotten if we hadn't done any # deduplication - -q = ''' -with donorscontributions as( - - SELECT donors.donor_id, - array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, - cast(contributions.amount as double) as amount - FROM donors INNER JOIN contributions - USING (donor_id) -) -SELECT name, sum(amount) AS totals -FROM donorscontributions -GROUP BY donor_id, name -ORDER BY totals DESC -LIMIT 10 -''' - -cur_dict = as_pandas(q).to_dict('records') +q = """ + with donorscontributions as( + + SELECT as_donors.donor_id, + array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, + cast(as_contributions.amount as double) as amount + FROM as_donors INNER JOIN as_contributions + USING (donor_id) + ) + SELECT name, sum(amount) AS totals + FROM donorscontributions + GROUP BY donor_id, name + ORDER BY totals DESC + LIMIT 10 +""" +cur = dict_cursor_execute(q, database=config.DATABASE) print("Top Donors (raw)") -for row in cur_dict: +for row in cur: row['totals'] = locale.currency(row['totals'], grouping=True) print('%(totals)20s: %(name)s' % row) -# Close our database connection -# read_con.close() -# write_con.close() - print('ran in', time.time() - start_time, 'seconds') -# In[9]: +# In[ ]: get_ipython().system('jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/') diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py index c8b5b3ea..45a5e254 100644 --- a/athena_example/athena_init.py +++ b/athena_example/athena_init.py @@ -51,7 +51,9 @@ print('importing raw data from csv...') -athenautils.athena_start_query("DROP TABLE IF EXISTS as_raw_table", database=config.DATABASE) +athenautils.drop_external_table("as_raw_table", + location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'), + 
database=config.DATABASE) athenautils.athena_start_query("DROP TABLE IF EXISTS as_donors", database=config.DATABASE) athenautils.athena_start_query("DROP TABLE IF EXISTS as_recipients", database=config.DATABASE) athenautils.athena_start_query("DROP TABLE IF EXISTS as_contributions", database=config.DATABASE) @@ -87,39 +89,40 @@ athenautils.athena_start_query(q, database=config.DATABASE) -df = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE, - error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#, - -# Remove the very few records that mess up the demo -# (demo purposes only! Don't do something like this in production) -df = df[df['RcvDate'].str.len()>=10] - -# set empty, non-zero, strings in date columns to null -df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan - -df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan - -#committee ID is requred. Remove the 2 rows that don't have it. -df = df[df['ID']!=''] - -# There's a record with a date stuck in the committee_id column, which causes -# problems when inserting into the contributions table below. Get rid of it this -# way. 
-df = df[df['ID'].str.len() <=9] - -# dropping the last columns -df = df.drop(columns='Unnamed: 29') - -# Nullifying empty strings -# df = df.replace(r'^\s*$', np.nan, regex=True) -df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand') - -athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None), - filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,)) - -# Athena is doesn't equate empty string and null, eventhough in the table spec we said so -# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it -# So we need to manually take care of that +df_cursor = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE, + error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''], + chunksize=config.BUFFERSIZE) +chunkcount = 0 +filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv') +for df in df_cursor: + # Remove the very few records that mess up the demo + # (demo purposes only! Don't do something like this in production) + df = df[df['RcvDate'].str.len()>=10] + + # set empty, non-zero, strings in date columns to null + df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan + + df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan + + #committee ID is requred. Remove the 2 rows that don't have it. + df = df[df['ID']!=''] + + # There's a record with a date stuck in the committee_id column, which causes + # problems when inserting into the contributions table below. Get rid of it this + # way. 
+ df = df[df['ID'].str.len() <=9] + + # dropping the last columns + df = df.drop(columns='Unnamed: 29') + + df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand') + + buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None) + + chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False) + athenautils.write(body=buffer, filename=chunk_fname) + chunkcount += 1 + print('creating donors table...') q=""" CREATE TABLE as_donors as @@ -147,22 +150,6 @@ print('creating contributions table') -# -- -# c.execute("CREATE TABLE contributions " -# "(contribution_id INT, donor_id INT, recipient_id INT, " -# " report_type VARCHAR(24), date_recieved DATE, " -# " loan_amount VARCHAR(12), amount VARCHAR(23), " -# " receipt_type VARCHAR(23), " -# " vendor_last_name VARCHAR(70), " -# " vendor_first_name VARCHAR(20), " -# " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), " -# " vendor_city VARCHAR(20), vendor_state VARCHAR(10), " -# " vendor_zip VARCHAR(10), description VARCHAR(90), " -# " election_type VARCHAR(10), election_year VARCHAR(10), " -# " report_period_begin DATE, report_period_end DATE) " -# "CHARACTER SET utf8 COLLATE utf8_unicode_ci") -# -- - q=""" CREATE TABLE as_contributions as SELECT reciept_id as contribution_id, diff --git a/athena_example/athenautils.py b/athena_example/athenautils.py index 3cd8e4dd..a88463e2 100644 --- a/athena_example/athenautils.py +++ b/athena_example/athenautils.py @@ -27,7 +27,7 @@ athena = boto3.client('athena', region_name=config.REGION, aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) -def cursor_execute(query, database=None, cursortype='dict', buffersize=config.BUFFERSIZE, +def cursor_execute(query, database=None, cursortype='tuple', buffersize=1000000, output_location=config.ATHENA_GARBAGE_PATH, 
region=config.REGION, workgroup=config.WORKGROUP, **kwargs): @@ -104,10 +104,36 @@ def list_all(path): if is_s3_url(path): bucket, key = seperate_bucket_key(path) objects = s3.list_objects_v2(Bucket=bucket, Prefix=key) + if not 'Contents' in objects: + return [] return [key['Key'] for key in objects['Contents']] from os import listdir from os.path import isfile, join + if not os.path.exists(path): + return [] return listdir(path) + +def del_all_files(path): + filelist = list_all(path) + if is_s3_url(path): + bucket, key = seperate_bucket_key(path) + for f in filelist: + s3.delete_object(Bucket=bucket, Key=f) + return + filelist = [os.path.join(path, f) for f in filelist] + for f in filelist: + if os.path.isfile(f): + os.remove(f) + else: + shutil.rmtree(f) + +def drop_external_table(tablename, location , database=None, + output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP): + athena_start_query('drop table if exists {}'.format(tablename), database=database, + output_location=output_location, region=region, workgroup=workgroup) + del_all_files(location) + + def pandas_read_csv(filepath_or_buffer, **kwargs): diff --git a/athena_example/config.py b/athena_example/config.py index f8e4a24b..9808a709 100644 --- a/athena_example/config.py +++ b/athena_example/config.py @@ -1,4 +1,5 @@ LOG_FILE = 'log.txt' + # Connection parameters ACCESS_KEY_ID = None SECRET_ACCESS_KEY = None diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb index e3a0e7b7..e452939c 100644 --- a/notebooks/athena_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -1,61 +1,8 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Note: \n", - "Looks good, but check the sanity check notebook to makesure everything is correct" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already 
satisfied: dedupe in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (2.0.6)\n", - "Requirement already satisfied: categorical-distance>=1.9 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.9)\n", - "Requirement already satisfied: dedupe-variable-datetime in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1.5)\n", - "Requirement already satisfied: affinegap>=1.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.11)\n", - "Requirement already satisfied: highered>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.2.1)\n", - "Requirement already satisfied: typing-extensions in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (3.7.4.3)\n", - "Requirement already satisfied: simplecosine>=1.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.2)\n", - "Requirement already satisfied: doublemetaphone in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1)\n", - "Requirement already satisfied: fastcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.1.26)\n", - "Requirement already satisfied: rlr>=2.4.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.4.5)\n", - "Requirement already satisfied: haversine>=0.4.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.3.0)\n", - "Requirement already satisfied: BTrees>=4.1.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (4.7.2)\n", - "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.18.1)\n", - "Requirement already satisfied: zope.index in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (5.0.0)\n", - "Requirement 
already satisfied: dedupe-hcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.3.8)\n", - "Requirement already satisfied: Levenshtein-search in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.4.5)\n", - "Requirement already satisfied: future in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.18.2)\n", - "Requirement already satisfied: datetime-distance in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.1.3)\n", - "Requirement already satisfied: pyhacrf-datamade>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from highered>=0.2.0->dedupe) (0.2.5)\n", - "Requirement already satisfied: pylbfgs in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from rlr>=2.4.3->dedupe) (0.2.0.13)\n", - "Requirement already satisfied: zope.interface in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (5.1.2)\n", - "Requirement already satisfied: persistent>=4.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (4.6.4)\n", - "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.14.0)\n", - "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (45.2.0.post20200210)\n", - "Requirement already satisfied: python-dateutil>=2.6.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from datetime-distance->dedupe-variable-datetime->dedupe) (2.8.1)\n", - "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.14.0)\n", - "Requirement 
already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == \"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.19)\n", - "\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.2.4 is available.\n", - "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install dedupe" - ] - }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -101,10 +48,21 @@ "sys.path.insert(0, '../athena_example/')\n", "import athenautils\n", "\n", - "def as_pandas(query, **kwrgs):\n", - " df = athenautils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n", - " return df.where(pd.notnull(df), None)\n", - "\n", + "def cursor_execute(query, database):\n", + " '''\n", + " The MySQL compatible Cursor\n", + " '''\n", + " return athenautils.cursor_execute(query, database=database, \n", + " cursortype='tuple', buffersize=config.BUFFERSIZE,\n", + " escapechar=None, keep_default_na=False, na_values=[''])\n", + "\n", + "def dict_cursor_execute(query, database):\n", + " '''\n", + " The MySQL compatible DicCursor\n", + " '''\n", + " return athenautils.cursor_execute(query, database=database, \n", + " cursortype='dict', buffersize=config.BUFFERSIZE,\n", + " escapechar=None, keep_default_na=False, na_values=[''])\n", "def record_pairs(result_set):\n", " for i, row in enumerate(result_set):\n", " a_record_id, a_record, b_record_id, b_record = row\n", @@ -125,26 +83,25 @@ " yield donor_id, cluster_id, score\n", "\n", "\n", - "# if __name__ == '__main__':\n", - "if True:\n", + "if __name__ == '__main__':\n", "\n", - " # ## Logging\n", + " ## Logging\n", "\n", " # Dedupe uses Python logging to show or suppress verbose output. Added\n", " # for convenience. 
To enable verbose output, run `python\n", " # examples/mysql_example/mysql_example.py -v`\n", " \n", - "# optp = optparse.OptionParser()\n", - "# optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", - "# help='Increase verbosity (specify multiple times for more)'\n", - "# )\n", - "# (opts, args) = optp.parse_args()\n", + " optp = optparse.OptionParser()\n", + " optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", + " help='Increase verbosity (specify multiple times for more)'\n", + " )\n", + " (opts, args) = optp.parse_args()\n", " log_level = logging.WARNING\n", - "# if opts.verbose:\n", - "# if opts.verbose == 1:\n", - "# log_level = logging.INFO\n", - "# elif opts.verbose >= 2:\n", - "# log_level = logging.DEBUG\n", + " if opts.verbose:\n", + " if opts.verbose == 1:\n", + " log_level = logging.INFO\n", + " elif opts.verbose >= 2:\n", + " log_level = logging.DEBUG\n", "\n", "\n", " logging.getLogger().setLevel(log_level)\n", @@ -160,17 +117,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reading from mysql_example_settings\n" - ] - } - ], + "outputs": [], "source": [ " # We'll be using variations on this following select statement to pull\n", " # in campaign donor info.\n", @@ -253,28 +202,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "blocking...\n", - "creating as_blocking_map database\n" - ] - }, - { - "data": { - "text/plain": [ - "'5651b314-d20b-4404-aa8d-30df70804e0e'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ " # ## Blocking\n", "\n", @@ -283,7 +213,9 @@ " # To run blocking on such a large set of data, we create a separate table\n", " # that contains blocking keys and record ids\n", 
" print('creating as_blocking_map database')\n", - " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\", database=config.DATABASE)\n", + " athenautils.drop_external_table(\"as_blocking_map\", \n", + " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n", + " database=config.DATABASE)\n", "\n", " q=\"\"\"\n", " CREATE EXTERNAL TABLE as_blocking_map \n", @@ -303,17 +235,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "creating inverted index\n" - ] - } - ], + "outputs": [], "source": [ " # If dedupe learned a Index Predicate, we have to take a pass\n", " # through the data and create indices.\n", @@ -334,137 +258,32 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "writing blocking map\n" - ] - } - ], + "outputs": [], "source": [ " # Now we are ready to write our blocking map table by creating a\n", " # generator that yields unique `(block_key, donor_id)` tuples.\n", " print('writing blocking map')\n", " \n", - " read_cur = athenautils.cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", + " read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", " full_data = ((row['donor_id'], row) for row in read_cur)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_0.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_1.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_2.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_3.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_4.csv\n", - 
"s3://ria-temp/as_dedupe/as_blocking_map/blocking_5.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_6.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_7.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_8.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_9.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_10.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_11.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_12.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_13.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_14.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_15.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_16.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_17.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_18.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_19.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_20.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_21.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_22.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_23.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_24.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_25.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_26.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_27.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_28.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_29.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_30.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_31.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_32.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_33.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_34.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_35.csv\n", - 
"s3://ria-temp/as_dedupe/as_blocking_map/blocking_36.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_37.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_38.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_39.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_40.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_41.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_42.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_43.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_44.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_45.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_46.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_47.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_48.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_49.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_50.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_51.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_52.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_53.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_54.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_55.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_56.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_57.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_58.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_59.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_60.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_61.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_62.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_63.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_64.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_65.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_66.csv\n", - 
"s3://ria-temp/as_dedupe/as_blocking_map/blocking_67.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_68.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_69.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_70.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_71.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_72.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_73.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_74.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_75.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_76.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_77.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_78.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_79.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_80.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_81.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_82.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_83.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_84.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_85.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_86.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_87.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_88.csv\n", - "s3://ria-temp/as_dedupe/as_blocking_map/blocking_89.csv\n" - ] - } - ], + "outputs": [], "source": [ " b_data = deduper.fingerprinter(full_data)\n", " athenautils.write_many(b_data, \n", - " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY, 'as_blocking_map/blocking.csv'))" + " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -487,535 +306,14 @@ " INNER JOIN as_processed_donors a on 
ids.east=a.donor_id\n", " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", " \"\"\"\n", - " read_cur = athenautils.cursor_execute(q, cursortype='tuple', database=config.DATABASE)\n" + " read_cur = cursor_execute(q, database=config.DATABASE)\n" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "clustering...\n", - "0\n", - "10000\n", - "20000\n", - "30000\n", - "40000\n", - "50000\n", - "60000\n", - "70000\n", - "80000\n", - "90000\n", - "100000\n", - "110000\n", - "120000\n", - "130000\n", - "140000\n", - "150000\n", - "160000\n", - "170000\n", - "180000\n", - "190000\n", - "200000\n", - "210000\n", - "220000\n", - "230000\n", - "240000\n", - "250000\n", - "260000\n", - "270000\n", - "280000\n", - "290000\n", - "300000\n", - "310000\n", - "320000\n", - "330000\n", - "340000\n", - "350000\n", - "360000\n", - "370000\n", - "380000\n", - "390000\n", - "400000\n", - "410000\n", - "420000\n", - "430000\n", - "440000\n", - "450000\n", - "460000\n", - "470000\n", - "480000\n", - "490000\n", - "500000\n", - "510000\n", - "520000\n", - "530000\n", - "540000\n", - "550000\n", - "560000\n", - "570000\n", - "580000\n", - "590000\n", - "600000\n", - "610000\n", - "620000\n", - "630000\n", - "640000\n", - "650000\n", - "660000\n", - "670000\n", - "680000\n", - "690000\n", - "700000\n", - "710000\n", - "720000\n", - "730000\n", - "740000\n", - "750000\n", - "760000\n", - "770000\n", - "780000\n", - "790000\n", - "800000\n", - "810000\n", - "820000\n", - "830000\n", - "840000\n", - "850000\n", - "860000\n", - "870000\n", - "880000\n", - "890000\n", - "900000\n", - "910000\n", - "920000\n", - "930000\n", - "940000\n", - "950000\n", - "960000\n", - "970000\n", - "980000\n", - "990000\n", - "1000000\n", - "1010000\n", - "1020000\n", - "1030000\n", - "1040000\n", - "1050000\n", - "1060000\n", - "1070000\n", - "1080000\n", - "1090000\n", - 
"1100000\n", - "1110000\n", - "1120000\n", - "1130000\n", - "1140000\n", - "1150000\n", - "1160000\n", - "1170000\n", - "1180000\n", - "1190000\n", - "1200000\n", - "1210000\n", - "1220000\n", - "1230000\n", - "1240000\n", - "1250000\n", - "1260000\n", - "1270000\n", - "1280000\n", - "1290000\n", - "1300000\n", - "1310000\n", - "1320000\n", - "1330000\n", - "1340000\n", - "1350000\n", - "1360000\n", - "1370000\n", - "1380000\n", - "1390000\n", - "1400000\n", - "1410000\n", - "1420000\n", - "1430000\n", - "1440000\n", - "1450000\n", - "1460000\n", - "1470000\n", - "1480000\n", - "1490000\n", - "1500000\n", - "1510000\n", - "1520000\n", - "1530000\n", - "1540000\n", - "1550000\n", - "1560000\n", - "1570000\n", - "1580000\n", - "1590000\n", - "1600000\n", - "1610000\n", - "1620000\n", - "1630000\n", - "1640000\n", - "1650000\n", - "1660000\n", - "1670000\n", - "1680000\n", - "1690000\n", - "1700000\n", - "1710000\n", - "1720000\n", - "1730000\n", - "1740000\n", - "1750000\n", - "1760000\n", - "1770000\n", - "1780000\n", - "1790000\n", - "1800000\n", - "1810000\n", - "1820000\n", - "1830000\n", - "1840000\n", - "1850000\n", - "1860000\n", - "1870000\n", - "1880000\n", - "1890000\n", - "1900000\n", - "1910000\n", - "1920000\n", - "1930000\n", - "1940000\n", - "1950000\n", - "1960000\n", - "1970000\n", - "1980000\n", - "1990000\n", - "2000000\n", - "2010000\n", - "2020000\n", - "2030000\n", - "2040000\n", - "2050000\n", - "2060000\n", - "2070000\n", - "2080000\n", - "2090000\n", - "2100000\n", - "2110000\n", - "2120000\n", - "2130000\n", - "2140000\n", - "2150000\n", - "2160000\n", - "2170000\n", - "2180000\n", - "2190000\n", - "2200000\n", - "2210000\n", - "2220000\n", - "2230000\n", - "2240000\n", - "2250000\n", - "2260000\n", - "2270000\n", - "2280000\n", - "2290000\n", - "2300000\n", - "2310000\n", - "2320000\n", - "2330000\n", - "2340000\n", - "2350000\n", - "2360000\n", - "2370000\n", - "2380000\n", - "2390000\n", - "2400000\n", - "2410000\n", - "2420000\n", - 
"2430000\n", - "2440000\n", - "2450000\n", - "2460000\n", - "2470000\n", - "2480000\n", - "2490000\n", - "2500000\n", - "2510000\n", - "2520000\n", - "2530000\n", - "2540000\n", - "2550000\n", - "2560000\n", - "2570000\n", - "2580000\n", - "2590000\n", - "2600000\n", - "2610000\n", - "2620000\n", - "2630000\n", - "2640000\n", - "2650000\n", - "2660000\n", - "2670000\n", - "2680000\n", - "2690000\n", - "2700000\n", - "2710000\n", - "2720000\n", - "2730000\n", - "2740000\n", - "2750000\n", - "2760000\n", - "2770000\n", - "2780000\n", - "2790000\n", - "2800000\n", - "2810000\n", - "2820000\n", - "2830000\n", - "2840000\n", - "2850000\n", - "2860000\n", - "2870000\n", - "2880000\n", - "2890000\n", - "2900000\n", - "2910000\n", - "2920000\n", - "2930000\n", - "2940000\n", - "2950000\n", - "2960000\n", - "2970000\n", - "2980000\n", - "2990000\n", - "3000000\n", - "3010000\n", - "3020000\n", - "3030000\n", - "3040000\n", - "3050000\n", - "3060000\n", - "3070000\n", - "3080000\n", - "3090000\n", - "3100000\n", - "3110000\n", - "3120000\n", - "3130000\n", - "3140000\n", - "3150000\n", - "3160000\n", - "3170000\n", - "3180000\n", - "3190000\n", - "3200000\n", - "3210000\n", - "3220000\n", - "3230000\n", - "3240000\n", - "3250000\n", - "3260000\n", - "3270000\n", - "3280000\n", - "3290000\n", - "3300000\n", - "3310000\n", - "3320000\n", - "3330000\n", - "3340000\n", - "3350000\n", - "3360000\n", - "3370000\n", - "3380000\n", - "3390000\n", - "3400000\n", - "3410000\n", - "3420000\n", - "3430000\n", - "3440000\n", - "3450000\n", - "3460000\n", - "3470000\n", - "3480000\n", - "3490000\n", - "3500000\n", - "3510000\n", - "3520000\n", - "3530000\n", - "3540000\n", - "3550000\n", - "3560000\n", - "3570000\n", - "3580000\n", - "3590000\n", - "3600000\n", - "3610000\n", - "3620000\n", - "3630000\n", - "3640000\n", - "3650000\n", - "3660000\n", - "3670000\n", - "3680000\n", - "3690000\n", - "3700000\n", - "3710000\n", - "3720000\n", - "3730000\n", - "3740000\n", - "3750000\n", - 
"3760000\n", - "3770000\n", - "3780000\n", - "3790000\n", - "3800000\n", - "3810000\n", - "3820000\n", - "3830000\n", - "3840000\n", - "3850000\n", - "3860000\n", - "3870000\n", - "3880000\n", - "3890000\n", - "3900000\n", - "3910000\n", - "3920000\n", - "3930000\n", - "3940000\n", - "3950000\n", - "3960000\n", - "3970000\n", - "3980000\n", - "3990000\n", - "4000000\n", - "4010000\n", - "4020000\n", - "4030000\n", - "4040000\n", - "4050000\n", - "4060000\n", - "4070000\n", - "4080000\n", - "4090000\n", - "4100000\n", - "4110000\n", - "4120000\n", - "4130000\n", - "4140000\n", - "4150000\n", - "4160000\n", - "4170000\n", - "4180000\n", - "4190000\n", - "4200000\n", - "4210000\n", - "4220000\n", - "4230000\n", - "4240000\n", - "4250000\n", - "4260000\n", - "4270000\n", - "4280000\n", - "4290000\n", - "4300000\n", - "4310000\n", - "4320000\n", - "4330000\n", - "4340000\n", - "4350000\n", - "4360000\n", - "4370000\n", - "4380000\n", - "4390000\n", - "4400000\n", - "4410000\n", - "4420000\n", - "4430000\n", - "4440000\n", - "4450000\n", - "4460000\n", - "4470000\n", - "4480000\n", - "4490000\n", - "4500000\n", - "4510000\n", - "4520000\n", - "4530000\n", - "4540000\n", - "4550000\n", - "4560000\n", - "4570000\n", - "4580000\n", - "4590000\n", - "4600000\n", - "4610000\n", - "4620000\n", - "4630000\n", - "4640000\n", - "4650000\n", - "4660000\n", - "4670000\n", - "4680000\n", - "4690000\n", - "4700000\n", - "4710000\n", - "4720000\n", - "4730000\n", - "4740000\n", - "4750000\n", - "4760000\n", - "4770000\n", - "4780000\n", - "4790000\n", - "4800000\n", - "4810000\n", - "4820000\n", - "4830000\n", - "4840000\n", - "4850000\n", - "4860000\n", - "4870000\n", - "4880000\n", - "4890000\n", - "4900000\n", - "4910000\n", - "4920000\n", - "4930000\n", - "4940000\n", - "4950000\n", - "4960000\n", - "4970000\n", - "4980000\n", - "4990000\n", - "5000000\n", - "5010000\n", - "5020000\n", - "5030000\n", - "5040000\n", - "5050000\n", - "5060000\n", - "5070000\n", - "5080000\n", - 
"5090000\n", - "5100000\n", - "5110000\n", - "5120000\n" - ] - } - ], + "outputs": [], "source": [ " # ## Clustering\n", "\n", @@ -1026,79 +324,15 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "creating as_entity_map database\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.048800940974382e-18\n", - "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.1038451096258602e-17\n", - "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.921472802362437e-17\n", - "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.3509749469939632e-16\n", - "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.753198598980096e-16\n", - "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0403512617910142e-15\n", - "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.894390795098561e-15\n", - "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 8.13221083415663e-15\n", - "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 2.2319167959587083e-14\n", - "WARNING:dedupe.clustering:A component contained 158375 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 6.068761194789993e-14\n", - "WARNING:dedupe.clustering:A component contained 158372 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.6520029686587212e-13\n", - "WARNING:dedupe.clustering:A component contained 158364 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.5125040046676256e-13\n", - "WARNING:dedupe.clustering:A component contained 158352 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.2269463311618112e-12\n", - "WARNING:dedupe.clustering:A component contained 158314 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.3356439660236965e-12\n", - "WARNING:dedupe.clustering:A component contained 157999 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.069763004960628e-12\n", - "WARNING:dedupe.clustering:A component contained 157528 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.4668471436592435e-11\n", - "WARNING:dedupe.clustering:A component contained 157002 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 6.705726454279488e-11\n", - "WARNING:dedupe.clustering:A component contained 156034 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.822899310600237e-10\n", - "WARNING:dedupe.clustering:A component contained 153167 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.955617200886392e-10\n", - "WARNING:dedupe.clustering:A component contained 150749 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 1.3472511446191333e-09\n", - "WARNING:dedupe.clustering:A component contained 148126 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.663451859737113e-09\n", - "WARNING:dedupe.clustering:A component contained 144445 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.961619074337575e-09\n", - "WARNING:dedupe.clustering:A component contained 140752 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.7079365851019727e-08\n", - "WARNING:dedupe.clustering:A component contained 136821 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 7.361173277021834e-08\n", - "WARNING:dedupe.clustering:A component contained 132985 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.00129481181664e-07\n", - "WARNING:dedupe.clustering:A component contained 129188 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 5.440113266301783e-07\n", - "WARNING:dedupe.clustering:A component contained 126461 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.4789049802767608e-06\n", - "WARNING:dedupe.clustering:A component contained 124279 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.020875427015852e-06\n", - "WARNING:dedupe.clustering:A component contained 121039 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0930919387732102e-05\n", - "WARNING:dedupe.clustering:A component contained 117376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.971327846301476e-05\n", - "WARNING:dedupe.clustering:A component contained 114455 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 8.076722851404745e-05\n", - "WARNING:dedupe.clustering:A component contained 109969 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.0002195200394847895\n", - "WARNING:dedupe.clustering:A component contained 106867 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.0005965000236037636\n", - "WARNING:dedupe.clustering:A component contained 101488 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.001619959237855584\n", - "WARNING:dedupe.clustering:A component contained 94945 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.004391296209574281\n", - "WARNING:dedupe.clustering:A component contained 89944 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.01184744490841891\n", - "WARNING:dedupe.clustering:A component contained 85759 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.031562819826922474\n", - "WARNING:dedupe.clustering:A component contained 79119 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.08138832068049236\n", - "WARNING:dedupe.clustering:A component contained 73185 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.1940994212134007\n", - "WARNING:dedupe.clustering:A component contained 67046 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.39565940016357204\n", - "WARNING:dedupe.clustering:A component contained 57601 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.6402437741869547\n", - "WARNING:dedupe.clustering:A component contained 36731 elements. Components larger than 30000 are re-filtered. 
The threshold for this filtering is 0.8286982262391892\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_0.csv\n", - "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_1.csv\n", - "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_2.csv\n", - "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_3.csv\n", - "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_4.csv\n" - ] - } - ], + "outputs": [], "source": [ - " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n", - "\n", + "# athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n", + " athenautils.drop_external_table(\"as_entity_map\", \n", + " location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n", + " database=config.DATABASE)\n", + " \n", " print('creating as_entity_map database')\n", " q=\"\"\"\n", " CREATE EXTERNAL TABLE as_entity_map \n", @@ -1117,45 +351,14 @@ " athenautils.athena_start_query(q, database=config.DATABASE) \n", "\n", " athenautils.write_many(cluster_ids(clustered_dupes),\n", - " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY, 'as_entity_map/as_entity_map.csv'))" + " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# duplicate sets\n", - "Top Donors (deduped)\n", - " $32,146,134.06: democratic party of illinois\n", - " $13,762,181.54: republican state senate campaign committee\n", - " $9,590,682.54: republican governors association\n", - " $9,040,913.46: madigan michael friends of\n", - " $7,949,218.49: seiu healthcare il in pac\n", - " $6,435,815.20: chicago teachers union, ift local 1\n", - " 
$6,353,463.90: illinois senate democratic fund (the)\n", - " $6,077,259.02: fred eychaner\n", - " $6,022,884.47: scott cohen\n", - " $5,911,667.89: illinois republican party\n", - "Top Donors (raw)\n", - " $14,319,194.47: democratic party of illinois\n", - " $13,020,132.76: democratic party of illinois\n", - " $9,027,432.54: republican governors association\n", - " $7,897,829.31: rga illinois 2010 pac\n", - " $6,675,000.00: madigan michael friends of\n", - " $6,008,841.69: scott cohen\n", - " $5,570,839.00: ronald gidwitz,\n", - " $5,562,800.00: citizens for emil jones\n", - " $5,324,649.63: paul wood,\n", - " $5,132,563.83: seiu healthcare il in\n", - "ran in 3723.114373922348 seconds\n" - ] - } - ], + "outputs": [], "source": [ " # Print out the number of duplicates found\n", " print('# duplicate sets')\n", @@ -1193,7 +396,7 @@ " ON as_donors.donor_id = donation_totals.canon_id\n", " ORDER BY totals DESC\n", " \"\"\"\n", - " cur = athenautils.cursor_execute(q, database=config.DATABASE)\n", + " cur = dict_cursor_execute(q, database=config.DATABASE)\n", "\n", " print(\"Top Donors (deduped)\")\n", " for row in cur:\n", @@ -1217,27 +420,32 @@ " ORDER BY totals DESC \n", " LIMIT 10\n", " \"\"\"\n", - " cur = athenautils.cursor_execute(q, database=config.DATABASE)\n", + " cur = dict_cursor_execute(q, database=config.DATABASE)\n", "\n", " print(\"Top Donors (raw)\")\n", " for row in cur:\n", " row['totals'] = locale.currency(row['totals'], grouping=True)\n", " print('%(totals)20s: %(name)s' % row)\n", "\n", - " # Close our database connection\n", - "# read_con.close()\n", - "# write_con.close()\n", - "\n", " print('ran in', time.time() - start_time, 'seconds')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NbConvertApp] Converting notebook athena_example.ipynb to script\n", + "[NbConvertApp] Writing 12622 bytes to 
../athena_example/athena_example.py\n" + ] + } + ], "source": [ - "# !jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/" + "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/\n" ] }, { diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb index a059f520..d35250de 100644 --- a/notebooks/athena_init_db.ipynb +++ b/notebooks/athena_init_db.ipynb @@ -2,20 +2,22 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install dedupe" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting ../athena_example/config.py\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile ../athena_example/config.py\n", "LOG_FILE = 'log.txt'\n", + "\n", "# Connection parameters\n", "ACCESS_KEY_ID = None\n", "SECRET_ACCESS_KEY = None\n", @@ -32,17 +34,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting ../athena_example/athena_init.py\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile ../athena_example/athena_init.py\n", "#!/usr/bin/python\n", @@ -98,7 +92,9 @@ "\n", "\n", "print('importing raw data from csv...')\n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_raw_table\", database=config.DATABASE)\n", + "athenautils.drop_external_table(\"as_raw_table\", \n", + " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n", + " database=config.DATABASE) \n", "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n", "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n", "athenautils.athena_start_query(\"DROP TABLE 
IF EXISTS as_contributions\", database=config.DATABASE)\n", @@ -134,39 +130,40 @@ "athenautils.athena_start_query(q, database=config.DATABASE)\n", "\n", "\n", - "df = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE, \n", - " error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#,\n", - "\n", - "# Remove the very few records that mess up the demo \n", - "# (demo purposes only! Don't do something like this in production)\n", - "df = df[df['RcvDate'].str.len()>=10]\n", - "\n", - "# set empty, non-zero, strings in date columns to null\n", - "df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n", - "\n", - "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", - "\n", - "#committee ID is requred. Remove the 2 rows that don't have it.\n", - "df = df[df['ID']!='']\n", - "\n", - "# There's a record with a date stuck in the committee_id column, which causes\n", - "# problems when inserting into the contributions table below. 
Get rid of it this \n", - "# way.\n", - "df = df[df['ID'].str.len() <=9]\n", - "\n", - "# dropping the last columns\n", - "df = df.drop(columns='Unnamed: 29')\n", - "\n", - "# Nullifying empty strings\n", - "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n", - "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", - "\n", - "athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n", - " filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))\n", - "\n", - "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n", - "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n", - "# So we need to manually take care of that\n", + "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE, \n", + " error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],\n", + " chunksize=config.BUFFERSIZE)\n", + "chunkcount = 0\n", + "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n", + "for df in df_cursor: \n", + " # Remove the very few records that mess up the demo \n", + " # (demo purposes only! Don't do something like this in production)\n", + " df = df[df['RcvDate'].str.len()>=10]\n", + "\n", + " # set empty, non-zero, strings in date columns to null\n", + " df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n", + "\n", + " df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", + "\n", + " #committee ID is requred. 
Remove the 2 rows that don't have it.\n", + " df = df[df['ID']!='']\n", + "\n", + " # There's a record with a date stuck in the committee_id column, which causes\n", + " # problems when inserting into the contributions table below. Get rid of it this \n", + " # way.\n", + " df = df[df['ID'].str.len() <=9]\n", + "\n", + " # dropping the last columns\n", + " df = df.drop(columns='Unnamed: 29')\n", + "\n", + " df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", + " \n", + " buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n", + " \n", + " chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n", + " athenautils.write(body=buffer, filename=chunk_fname)\n", + " chunkcount += 1 \n", + " \n", "print('creating donors table...')\n", "q=\"\"\"\n", "CREATE TABLE as_donors as\n", @@ -194,22 +191,6 @@ "\n", "print('creating contributions table')\n", "\n", - "# --\n", - "# c.execute(\"CREATE TABLE contributions \"\n", - "# \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n", - "# \" report_type VARCHAR(24), date_recieved DATE, \"\n", - "# \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n", - "# \" receipt_type VARCHAR(23), \"\n", - "# \" vendor_last_name VARCHAR(70), \"\n", - "# \" vendor_first_name VARCHAR(20), \"\n", - "# \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n", - "# \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n", - "# \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n", - "# \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n", - "# \" report_period_begin DATE, report_period_end DATE) \"\n", - "# \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n", - "# --\n", - "\n", "q=\"\"\"\n", "CREATE TABLE as_contributions as\n", " SELECT reciept_id as contribution_id, \n", @@ -264,35 +245,12 @@ }, { "cell_type": 
"code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "importing raw data from csv...\n", - "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n", - "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n", - "creating donors table...\n", - "creating contributions table\n", - "done\n" - ] - } - ], + "outputs": [], "source": [ "!python ../athena_example/athena_init.py" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 5636f41b6ac9531ca2d6118f4fbfe520aae57711 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 4 Nov 2020 06:37:31 +0000 Subject: [PATCH 11/19] checkpoint --- athena_example/athena_example.py | 332 ++----------------------------- notebooks/athena_example.ipynb | 145 +++++--------- 2 files changed, 68 insertions(+), 409 deletions(-) diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py index 384172f2..7cedbfe6 100644 --- a/athena_example/athena_example.py +++ b/athena_example/athena_example.py @@ -1,8 +1,3 @@ -#!/usr/bin/env python -# coding: utf-8 - -# In[ ]: - """ This is an example of working with very large data. There are about @@ -80,8 +75,8 @@ def cluster_ids(clustered_dupes): for donor_id, score in zip(cluster, scores): yield donor_id, cluster_id, score - -if __name__ == '__main__': +if True: +# if __name__ == '__main__': ## Logging @@ -89,20 +84,20 @@ def cluster_ids(clustered_dupes): # for convenience. 
To enable verbose output, run `python # examples/mysql_example/mysql_example.py -v` - optp = optparse.OptionParser() - optp.add_option('-v', '--verbose', dest='verbose', action='count', - help='Increase verbosity (specify multiple times for more)' - ) - (opts, args) = optp.parse_args() +# optp = optparse.OptionParser() +# optp.add_option('-v', '--verbose', dest='verbose', action='count', +# help='Increase verbosity (specify multiple times for more)' +# ) +# (opts, args) = optp.parse_args() log_level = logging.WARNING - if opts.verbose: - if opts.verbose == 1: - log_level = logging.INFO - elif opts.verbose >= 2: - log_level = logging.DEBUG +# if opts.verbose: +# if opts.verbose == 1: +# log_level = logging.INFO +# elif opts.verbose >= 2: +# log_level = logging.DEBUG - logging.getLogger().setLevel(log_level) +# logging.getLogger().setLevel(log_level) @@ -111,304 +106,3 @@ def cluster_ids(clustered_dupes): training_file = 'mysql_example_training.json' start_time = time.time() - - -# In[ ]: - - -# We'll be using variations on this following select statement to pull -# in campaign donor info. 
-# -# We did a fair amount of preprocessing of the fields in -# `mysql_init_db.py` -DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address - from as_processed_donors""" - -# ## Training - -if os.path.exists(settings_file): - print('reading from ', settings_file) - with open(settings_file, 'rb') as sf: - deduper = dedupe.StaticDedupe(sf, num_cores=4) -else: - # Define the fields dedupe will pay attention to - # - # The address, city, and zip fields are often missing, so we'll - # tell dedupe that, and we'll learn a model that take that into - # account - fields = [{'field': 'name', 'type': 'String'}, - {'field': 'address', 'type': 'String', - 'has missing': True}, - {'field': 'city', 'type': 'ShortString', 'has missing': True}, - {'field': 'state', 'type': 'ShortString', 'has missing': True}, - {'field': 'zip', 'type': 'ShortString', 'has missing': True}, - ] - - # Create a new deduper object and pass our data model to it. - deduper = dedupe.Dedupe(fields, num_cores=4) - - # We will sample pairs from the entire donor table for training - cur = cur_execute(DONOR_SELECT) - temp_d = {i: row for i, row in enumerate(cur)} - - - # If we have training data saved from a previous run of dedupe, - # look for it an load it in. - # - # __Note:__ if you want to train from - # scratch, delete the training_file - if os.path.exists(training_file): - print('reading labeled examples from ', training_file) - with open(training_file) as tf: - deduper.prepare_training(temp_d, training_file=tf) - else: - deduper.prepare_training(temp_d) - - del temp_d - - # ## Active learning - - print('starting active labeling...') - # Starts the training loop. Dedupe will find the next pair of records - # it is least certain about and ask you to label them as duplicates - # or not. 
- - # use 'y', 'n' and 'u' keys to flag duplicates - # press 'f' when you are finished - dedupe.convenience.console_label(deduper) - # When finished, save our labeled, training pairs to disk - with open(training_file, 'w') as tf: - deduper.write_training(tf) - - # Notice our the argument here - # - # `recall` is the proportion of true dupes pairs that the learned - # rules must cover. You may want to reduce this if your are making - # too many blocks and too many comparisons. - deduper.train(recall=0.90) - - with open(settings_file, 'wb') as sf: - deduper.write_settings(sf) - - # We can now remove some of the memory hobbing objects we used - # for training - deduper.cleanup_training() - - -# In[ ]: - - -# ## Blocking - -print('blocking...') - -# To run blocking on such a large set of data, we create a separate table -# that contains blocking keys and record ids -print('creating as_blocking_map database') -athenautils.drop_external_table("as_blocking_map", - location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'), - database=config.DATABASE) - -q=""" -CREATE EXTERNAL TABLE as_blocking_map - (block_key VARCHAR(200), donor_id INTEGER) -ROW FORMAT DELIMITED - FIELDS TERMINATED BY '\t' - LINES TERMINATED BY '\n' -LOCATION - 's3://{}/{}' -TBLPROPERTIES ( - 'classification'='csv', - --'skip.header.line.count'='1', - 'serialization.null.format'='') -""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') -athenautils.athena_start_query(q, database=config.DATABASE) - - -# In[ ]: - - -# If dedupe learned a Index Predicate, we have to take a pass -# through the data and create indices. -print('creating inverted index') - -# Armin: -# This never runs, index_fields is empty, possible bug? 
-for field in deduper.fingerprinter.index_fields: - q = """ - SELECT DISTINCT {field} FROM as_processed_donors - WHERE {field} IS NOT NULL - """.format(field=field) - cur = cur_execute(q) - field_data = (row[field] for row in cur) - deduper.fingerprinter.index(field_data, field) - - - -# In[ ]: - - -# Now we are ready to write our blocking map table by creating a -# generator that yields unique `(block_key, donor_id)` tuples. -print('writing blocking map') - -read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE) -full_data = ((row['donor_id'], row) for row in read_cur) - - -# In[ ]: - - -b_data = deduper.fingerprinter(full_data) -athenautils.write_many(b_data, - filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv')) - - -# In[ ]: - - - - # select unique pairs to compare - q=""" - SELECT a.donor_id, - json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], - ARRAY[ a.city, a.name, a.zip, a.state, a.address]) - AS JSON)), - b.donor_id, - json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], - ARRAY[ b.city, b.name, b.zip, b.state, b.address]) - AS JSON)) - FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west - from as_blocking_map as l - INNER JOIN as_blocking_map as r - using (block_key) - where l.donor_id < r.donor_id) ids - INNER JOIN as_processed_donors a on ids.east=a.donor_id - INNER JOIN as_processed_donors b on ids.west=b.donor_id - """ - read_cur = cursor_execute(q, database=config.DATABASE) - - -# In[ ]: - - -# ## Clustering - -print('clustering...') -clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)), - threshold=0.5) - - -# In[ ]: - - -# athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE) - athenautils.drop_external_table("as_entity_map", - location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), - database=config.DATABASE) - - 
print('creating as_entity_map database') - q=""" - CREATE EXTERNAL TABLE as_entity_map - (donor_id INTEGER, canon_id INTEGER, - cluster_score FLOAT) - ROW FORMAT DELIMITED - FIELDS TERMINATED BY '\t' - LINES TERMINATED BY '\n' - LOCATION - 's3://{}/{}' - TBLPROPERTIES ( - 'classification'='csv', - --'skip.header.line.count'='1', - 'serialization.null.format'='') - """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') - athenautils.athena_start_query(q, database=config.DATABASE) - - athenautils.write_many(cluster_ids(clustered_dupes), - filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv')) - - -# In[ ]: - - -# Print out the number of duplicates found -print('# duplicate sets') - -# ## Payoff - -# With all this done, we can now begin to ask interesting questions -# of the data -# -# For example, let's see who the top 10 donors are. - -locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers - -athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE) - -q = """ - CREATE TABLE as_e_map as - SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id - FROM as_entity_map - RIGHT JOIN as_donors USING(donor_id) - """ -athenautils.athena_start_query(q, database=config.DATABASE) - -q = """ - SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, - donation_totals.totals AS totals - FROM as_donors INNER JOIN - (SELECT canon_id, SUM(cast (amount as double)) AS totals - FROM as_contributions INNER JOIN as_e_map - USING (donor_id) - GROUP BY (canon_id) - ORDER BY totals - DESC LIMIT 10) - AS donation_totals - ON as_donors.donor_id = donation_totals.canon_id - ORDER BY totals DESC -""" -cur = dict_cursor_execute(q, database=config.DATABASE) - -print("Top Donors (deduped)") -for row in cur: - row['totals'] = locale.currency(row['totals'], grouping=True) - 
print('%(totals)20s: %(name)s' % row) - -# Compare this to what we would have gotten if we hadn't done any -# deduplication -q = """ - with donorscontributions as( - - SELECT as_donors.donor_id, - array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, - cast(as_contributions.amount as double) as amount - FROM as_donors INNER JOIN as_contributions - USING (donor_id) - ) - SELECT name, sum(amount) AS totals - FROM donorscontributions - GROUP BY donor_id, name - ORDER BY totals DESC - LIMIT 10 -""" -cur = dict_cursor_execute(q, database=config.DATABASE) - -print("Top Donors (raw)") -for row in cur: - row['totals'] = locale.currency(row['totals'], grouping=True) - print('%(totals)20s: %(name)s' % row) - -print('ran in', time.time() - start_time, 'seconds') - - -# In[ ]: - - -get_ipython().system('jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/') - - -# In[ ]: - - - - diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb index e452939c..69edb207 100644 --- a/notebooks/athena_example.ipynb +++ b/notebooks/athena_example.ipynb @@ -2,10 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting ../athena_example/athena_example.py\n" + ] + } + ], "source": [ + "%%writefile ../athena_example/athena_example.py\n", + "\n", "\"\"\"\n", "This is an example of working with very large data. 
There are about\n", "700,000 unduplicated donors in this database of Illinois political\n", @@ -112,15 +122,8 @@ " settings_file = 'mysql_example_settings'\n", " training_file = 'mysql_example_training.json'\n", "\n", - " start_time = time.time()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " start_time = time.time()\n", + "\n", " # We'll be using variations on this following select statement to pull\n", " # in campaign donor info.\n", " #\n", @@ -153,7 +156,7 @@ " deduper = dedupe.Dedupe(fields, num_cores=4)\n", "\n", " # We will sample pairs from the entire donor table for training\n", - " cur = cur_execute(DONOR_SELECT)\n", + " cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", " temp_d = {i: row for i, row in enumerate(cur)}\n", " \n", "\n", @@ -197,15 +200,8 @@ "\n", " # We can now remove some of the memory hobbing objects we used\n", " # for training\n", - " deduper.cleanup_training()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " deduper.cleanup_training()\n", + "\n", " # ## Blocking\n", "\n", " print('blocking...')\n", @@ -230,15 +226,8 @@ " --'skip.header.line.count'='1', \n", " 'serialization.null.format'='')\n", " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n", - " athenautils.athena_start_query(q, database=config.DATABASE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " athenautils.athena_start_query(q, database=config.DATABASE)\n", + "\n", " # If dedupe learned a Index Predicate, we have to take a pass\n", " # through the data and create indices.\n", " print('creating inverted index')\n", @@ -250,43 +239,22 @@ " SELECT DISTINCT {field} FROM as_processed_donors\n", " WHERE {field} IS NOT NULL\n", " \"\"\".format(field=field)\n", - " cur = cur_execute(q)\n", + " cur = dict_cursor_execute(q, 
databse=config.DATABASE)\n", " field_data = (row[field] for row in cur)\n", " deduper.fingerprinter.index(field_data, field)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " \n", + "\n", " # Now we are ready to write our blocking map table by creating a\n", " # generator that yields unique `(block_key, donor_id)` tuples.\n", " print('writing blocking map')\n", " \n", " read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", - " full_data = ((row['donor_id'], row) for row in read_cur)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " full_data = ((row['donor_id'], row) for row in read_cur)\n", + "\n", " b_data = deduper.fingerprinter(full_data)\n", " athenautils.write_many(b_data, \n", - " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n", + "\n", "\n", " # select unique pairs to compare\n", " q=\"\"\"\n", @@ -306,28 +274,15 @@ " INNER JOIN as_processed_donors a on ids.east=a.donor_id\n", " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", " \"\"\"\n", - " read_cur = cursor_execute(q, database=config.DATABASE)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " read_cur = cursor_execute(q, database=config.DATABASE)\n", + "\n", + "\n", " # ## Clustering\n", "\n", " print('clustering...')\n", " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n", - " threshold=0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " threshold=0.5)\n", + "\n", "# 
athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n", " athenautils.drop_external_table(\"as_entity_map\", \n", " location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n", @@ -351,15 +306,8 @@ " athenautils.athena_start_query(q, database=config.DATABASE) \n", "\n", " athenautils.write_many(cluster_ids(clustered_dupes),\n", - " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n", + "\n", " # Print out the number of duplicates found\n", " print('# duplicate sets')\n", "\n", @@ -427,25 +375,42 @@ " row['totals'] = locale.currency(row['totals'], grouping=True)\n", " print('%(totals)20s: %(name)s' % row)\n", "\n", - " print('ran in', time.time() - start_time, 'seconds')" + " print('ran in', time.time() - start_time, 'seconds')\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[NbConvertApp] Converting notebook athena_example.ipynb to script\n", - "[NbConvertApp] Writing 12622 bytes to ../athena_example/athena_example.py\n" + "^C\r\n", + "Traceback (most recent call last):\r\n", + " File \"../athena_example/athena_example.py\", line 156, in \r\n", + " deduper.prepare_training(temp_d)\r\n", + " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1249, in prepare_training\r\n", + " self._sample(data, sample_size, blocked_proportion, original_length)\r\n", + " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1287, in _sample\r\n", + " index_include=examples)\r\n", + " File 
\"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 418, in __init__\r\n", + " index_include)\r\n", + " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 246, in __init__\r\n", + " index_data)\r\n", + " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 128, in __init__\r\n", + " simple_cover = self.coveredPairs(self.blocker, sampled_records)\r\n", + " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 156, in coveredPairs\r\n", + " for block in pred_cover.values()\r\n", + " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 157, in \r\n", + " for pair in itertools.combinations(sorted(block), 2)}\r\n", + "KeyboardInterrupt\r\n" ] } ], "source": [ - "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/\n" + "!python ../athena_example/athena_example.py" ] }, { From 7a8caf62200649399c24f2fddeb6cdc16296302f Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 5 Nov 2020 13:46:04 +0000 Subject: [PATCH 12/19] final version --- athena_example/athena_example.py | 293 ++++++++++++++++++-- athena_example/utils.py | 139 ++++++++++ notebooks/athena_example.ipynb | 452 ------------------------------- notebooks/athena_init_db.ipynb | 277 ------------------- 4 files changed, 412 insertions(+), 749 deletions(-) create mode 100644 athena_example/utils.py delete mode 100644 notebooks/athena_example.ipynb delete mode 100644 notebooks/athena_init_db.ipynb diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py index 7cedbfe6..c1c7fb3c 100644 --- a/athena_example/athena_example.py +++ b/athena_example/athena_example.py @@ -8,18 +8,18 @@ we need to make in memory. Instead, we will read the pairs on demand from the Athena database. 
-__Note:__ You will need to run `python mysql_init_db.py` +__Note:__ You will need to run `python athena_init_db.py` before running this script. See the annotates source for -[mysql_init_db.py](mysql_init_db.html) +[athena_init_db.py](athena_init_db.html) For smaller datasets (<10,000), see our [csv_example](csv_example.html) """ # There is a little bit difference between the result -# of this module and the mysql one. The reason is due to +# of this module and the athena one. The reason is due to # Some special (and mostly erroneous) characters, such as \a .. -# Which are dealt with differently by mysql and athena/panda +# Which are dealt with differently by athena and athena/panda import sys import os @@ -75,34 +75,287 @@ def cluster_ids(clustered_dupes): for donor_id, score in zip(cluster, scores): yield donor_id, cluster_id, score -if True: -# if __name__ == '__main__': + +if __name__ == '__main__': ## Logging # Dedupe uses Python logging to show or suppress verbose output. Added # for convenience. 
To enable verbose output, run `python - # examples/mysql_example/mysql_example.py -v` + # examples/athena_example/athena_example.py -v` -# optp = optparse.OptionParser() -# optp.add_option('-v', '--verbose', dest='verbose', action='count', -# help='Increase verbosity (specify multiple times for more)' -# ) -# (opts, args) = optp.parse_args() + optp = optparse.OptionParser() + optp.add_option('-v', '--verbose', dest='verbose', action='count', + help='Increase verbosity (specify multiple times for more)' + ) + (opts, args) = optp.parse_args() log_level = logging.WARNING -# if opts.verbose: -# if opts.verbose == 1: -# log_level = logging.INFO -# elif opts.verbose >= 2: -# log_level = logging.DEBUG + if opts.verbose: + if opts.verbose == 1: + log_level = logging.INFO + elif opts.verbose >= 2: + log_level = logging.DEBUG -# logging.getLogger().setLevel(log_level) + logging.getLogger().setLevel(log_level) - settings_file = 'mysql_example_settings' - training_file = 'mysql_example_training.json' + settings_file = 'athena_example_settings' + training_file = 'athena_example_training.json' start_time = time.time() + + # We'll be using variations on this following select statement to pull + # in campaign donor info. 
+ # + # We did a fair amount of preprocessing of the fields in + # `athena_init_db.py` + DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address + from as_processed_donors""" + + # ## Training + + if os.path.exists(settings_file): + print('reading from ', settings_file) + with open(settings_file, 'rb') as sf: + deduper = dedupe.StaticDedupe(sf, num_cores=4) + else: + # Define the fields dedupe will pay attention to + # + # The address, city, and zip fields are often missing, so we'll + # tell dedupe that, and we'll learn a model that take that into + # account + fields = [{'field': 'name', 'type': 'String'}, + {'field': 'address', 'type': 'String', + 'has missing': True}, + {'field': 'city', 'type': 'ShortString', 'has missing': True}, + {'field': 'state', 'type': 'ShortString', 'has missing': True}, + {'field': 'zip', 'type': 'ShortString', 'has missing': True}, + ] + + # Create a new deduper object and pass our data model to it. + deduper = dedupe.Dedupe(fields, num_cores=4) + + # We will sample pairs from the entire donor table for training + cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE) + temp_d = {i: row for i, row in enumerate(cur)} + + + # If we have training data saved from a previous run of dedupe, + # look for it an load it in. + # + # __Note:__ if you want to train from + # scratch, delete the training_file + if os.path.exists(training_file): + print('reading labeled examples from ', training_file) + with open(training_file) as tf: + deduper.prepare_training(temp_d, training_file=tf) + else: + deduper.prepare_training(temp_d) + + del temp_d + + # ## Active learning + + print('starting active labeling...') + # Starts the training loop. Dedupe will find the next pair of records + # it is least certain about and ask you to label them as duplicates + # or not. 
+ + # use 'y', 'n' and 'u' keys to flag duplicates + # press 'f' when you are finished + dedupe.convenience.console_label(deduper) + # When finished, save our labeled, training pairs to disk + with open(training_file, 'w') as tf: + deduper.write_training(tf) + + # Notice our the argument here + # + # `recall` is the proportion of true dupes pairs that the learned + # rules must cover. You may want to reduce this if your are making + # too many blocks and too many comparisons. + deduper.train(recall=0.90) + + with open(settings_file, 'wb') as sf: + deduper.write_settings(sf) + + # We can now remove some of the memory hobbing objects we used + # for training + deduper.cleanup_training() + + # ## Blocking + + print('blocking...') + + # To run blocking on such a large set of data, we create a separate table + # that contains blocking keys and record ids + print('creating as_blocking_map database') + athenautils.drop_external_table("as_blocking_map", + location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'), + database=config.DATABASE) + + q=""" + CREATE EXTERNAL TABLE as_blocking_map + (block_key VARCHAR(200), donor_id INTEGER) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + LINES TERMINATED BY '\n' + LOCATION + 's3://{}/{}' + TBLPROPERTIES ( + 'classification'='csv', + --'skip.header.line.count'='1', + 'serialization.null.format'='') + """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') + athenautils.athena_start_query(q, database=config.DATABASE) + + # If dedupe learned a Index Predicate, we have to take a pass + # through the data and create indices. + print('creating inverted index') + + # Armin: + # This never runs, index_fields is empty, possible bug? 
+ for field in deduper.fingerprinter.index_fields: + q = """ + SELECT DISTINCT {field} FROM as_processed_donors + WHERE {field} IS NOT NULL + """.format(field=field) + cur = dict_cursor_execute(q, databse=config.DATABASE) + field_data = (row[field] for row in cur) + deduper.fingerprinter.index(field_data, field) + + + # Now we are ready to write our blocking map table by creating a + # generator that yields unique `(block_key, donor_id)` tuples. + print('writing blocking map') + + read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE) + full_data = ((row['donor_id'], row) for row in read_cur) + + b_data = deduper.fingerprinter(full_data) + athenautils.write_many(b_data, + filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv')) + + + # select unique pairs to compare + q=""" + SELECT a.donor_id, + json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], + ARRAY[ a.city, a.name, a.zip, a.state, a.address]) + AS JSON)), + b.donor_id, + json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], + ARRAY[ b.city, b.name, b.zip, b.state, b.address]) + AS JSON)) + FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west + from as_blocking_map as l + INNER JOIN as_blocking_map as r + using (block_key) + where l.donor_id < r.donor_id) ids + INNER JOIN as_processed_donors a on ids.east=a.donor_id + INNER JOIN as_processed_donors b on ids.west=b.donor_id + """ + read_cur = cursor_execute(q, database=config.DATABASE) + + + # ## Clustering + + print('clustering...') + clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)), + threshold=0.5) + +# athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE) + athenautils.drop_external_table("as_entity_map", + location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), + database=config.DATABASE) + + print('creating as_entity_map database') + 
q=""" + CREATE EXTERNAL TABLE as_entity_map + (donor_id INTEGER, canon_id INTEGER, + cluster_score FLOAT) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t' + LINES TERMINATED BY '\n' + LOCATION + 's3://{}/{}' + TBLPROPERTIES ( + 'classification'='csv', + --'skip.header.line.count'='1', + 'serialization.null.format'='') + """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') + athenautils.athena_start_query(q, database=config.DATABASE) + + athenautils.write_many(cluster_ids(clustered_dupes), + filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv')) + + # Print out the number of duplicates found + print('# duplicate sets') + + # ## Payoff + + # With all this done, we can now begin to ask interesting questions + # of the data + # + # For example, let's see who the top 10 donors are. + + locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers + + athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE) + + q = """ + CREATE TABLE as_e_map as + SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id + FROM as_entity_map + RIGHT JOIN as_donors USING(donor_id) + """ + athenautils.athena_start_query(q, database=config.DATABASE) + + q = """ + SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, + donation_totals.totals AS totals + FROM as_donors INNER JOIN + (SELECT canon_id, SUM(cast (amount as double)) AS totals + FROM as_contributions INNER JOIN as_e_map + USING (donor_id) + GROUP BY (canon_id) + ORDER BY totals + DESC LIMIT 10) + AS donation_totals + ON as_donors.donor_id = donation_totals.canon_id + ORDER BY totals DESC + """ + cur = dict_cursor_execute(q, database=config.DATABASE) + + print("Top Donors (deduped)") + for row in cur: + row['totals'] = locale.currency(row['totals'], grouping=True) + print('%(totals)20s: %(name)s' % row) + + # Compare this 
to what we would have gotten if we hadn't done any + # deduplication + q = """ + with donorscontributions as( + + SELECT as_donors.donor_id, + array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, + cast(as_contributions.amount as double) as amount + FROM as_donors INNER JOIN as_contributions + USING (donor_id) + ) + SELECT name, sum(amount) AS totals + FROM donorscontributions + GROUP BY donor_id, name + ORDER BY totals DESC + LIMIT 10 + """ + cur = dict_cursor_execute(q, database=config.DATABASE) + + print("Top Donors (raw)") + for row in cur: + row['totals'] = locale.currency(row['totals'], grouping=True) + print('%(totals)20s: %(name)s' % row) + + print('ran in', time.time() - start_time, 'seconds') diff --git a/athena_example/utils.py b/athena_example/utils.py new file mode 100644 index 00000000..1b8b935a --- /dev/null +++ b/athena_example/utils.py @@ -0,0 +1,139 @@ +from __future__ import print_function +import re +import boto3 +import botocore +import sys +import datetime +import os +import time +import pandas as pd +from six import string_types +import sys +pyver = sys.version_info[0] + +if pyver<3: + from StringIO import StringIO as SomethingIO + from urlparse import urlparse +else: + from io import BytesIO as SomethingIO + from urllib.parse import urlparse + +sys.path.insert(0, '../athena_example/') +import config + +s3 = boto3.client('s3', region_name=config.REGION, + aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) + +athena = boto3.client('athena', region_name=config.REGION, + aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) + +def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs): + query_execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True) + df = 
pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs) + return df + + +def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True): + query_execution_id = athena.start_query_execution( + QueryString=query, + QueryExecutionContext={ + 'Database': database + }, + WorkGroup=workgroup, + ResultConfiguration={ + "OutputLocation": output_location + } + )['QueryExecutionId'] + + seconds_to_wait = 1 + + if wait_until_finished: + while True: + time.sleep(seconds_to_wait) + seconds_to_wait += 1 +# seconds_to_wait *= 2 + + execution = athena.get_query_execution( + QueryExecutionId=query_execution_id + ) + + if execution['QueryExecution']['Status']['State'] not in ['QUEUED', 'RUNNING']: + break + + if execution['QueryExecution']['Status']['State'] != 'SUCCEEDED': + raise Exception("Athena query failed: %s" % ( execution['QueryExecution']['Status']['StateChangeReason'],), query_execution_id) + + return query_execution_id + +# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py +# Import it instead, when it's updated. 
# URL schemes treated as S3 locations, shared by is_s3_url() and
# seperate_bucket_key() so the two helpers cannot disagree.
_S3_SCHEMES = ("s3", "s3n", "s3a")
_S3_URL_RE = re.compile(r'(?:s3|s3n|s3a)://([^/]+)/(.*)')


def is_s3_url(url):
    """Check for an s3, s3n, or s3a url.

    Returns False instead of raising when *url* cannot be parsed.
    """
    try:
        return urlparse(url).scheme in _S3_SCHEMES
    except Exception:
        return False


def seperate_bucket_key(url):
    """Split ``<scheme>://bucket/key`` into a ``(bucket, key)`` tuple.

    Accepts the same schemes as is_s3_url().  The key may be empty
    (``s3://bucket/``) but the slash after the bucket is required.

    Raises:
        ValueError: if *url* does not have that form.  (Previously this
            surfaced as an opaque AttributeError on ``None.group``.)
    """
    m = _S3_URL_RE.match(url)
    if m is None:
        raise ValueError(
            "Not an S3 url of the form s3://bucket/key: {!r}".format(url))
    return m.group(1), m.group(2)


def list_all(path):
    """List the entries found under *path*.

    For an S3 url, returns the keys of every object whose key starts with
    the url's key part.  For anything else, returns ``os.listdir(path)``.
    """
    if is_s3_url(path):
        bucket, prefix = seperate_bucket_key(path)
        keys = []
        # Paginate: a single list_objects_v2 call returns one page only,
        # and a page has no 'Contents' entry when nothing matches.
        paginator = s3.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            keys.extend(obj['Key'] for obj in page.get('Contents', []))
        return keys
    return os.listdir(path)


def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs):
    """Download an S3 object and parse it with ``pd.read_csv``.

    *filepath_or_buffer* must be an S3 url: it is always split with
    seperate_bucket_key().  *verbose* is accepted but not used here.
    Extra keyword arguments are forwarded to ``pd.read_csv``.
    """
    bucket, key = seperate_bucket_key(filepath_or_buffer)
    obj = s3.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(SomethingIO(obj['Body'].read()), **kwargs)


def read(filename, verbose=True):
    """Return the full contents of an S3 object or a local file.

    The access is logged through log().  S3 urls return whatever
    ``Body.read()`` yields; local files are opened in text mode.
    NOTE(review): the two branches therefore presumably differ in return
    type (bytes vs. str on Python 3) -- confirm what callers expect.
    """
    log("Reading {}".format(filename), verbose=verbose)
    if is_s3_url(filename):
        bucket, key = seperate_bucket_key(filename)
        obj = s3.get_object(Bucket=bucket, Key=key)
        return obj['Body'].read()
    with open(filename) as f:
        return f.read()


def write(body, filename):
    """Upload *body* to the S3 url *filename* (S3 only; returns None)."""
    bucket, key = seperate_bucket_key(filename)
    s3.put_object(Bucket=bucket, Key=key, Body=body)


def file_exists(filename):
    """Return True if the S3 object at *filename* exists, False otherwise.

    Only a 'NoSuchKey' error is treated as "does not exist"; any other
    client error is re-raised.
    """
    bucket, key = seperate_bucket_key(filename)
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchKey':
            return False
        # Something else has gone wrong.
        raise
    # Only existence matters: release the streaming body (and its
    # connection) instead of leaving it open and unread.
    response['Body'].close()
    return True
+ raise + else: + return True + + +def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False): + if verbose == False: + return + if timestamped: + outstr = "[%s]\t%s\n" % (str(datetime.datetime.now()) , outstr) + else: + outstr = "%s\n" % (outstr,) + + with open(logfile_name, "a") as logfile: + logfile.write(outstr) + + if not quiet: + sys.stdout.write(outstr); + sys.stdout.flush() +# Print iterations progress diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb deleted file mode 100644 index 69edb207..00000000 --- a/notebooks/athena_example.ipynb +++ /dev/null @@ -1,452 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting ../athena_example/athena_example.py\n" - ] - } - ], - "source": [ - "%%writefile ../athena_example/athena_example.py\n", - "\n", - "\"\"\"\n", - "This is an example of working with very large data. There are about\n", - "700,000 unduplicated donors in this database of Illinois political\n", - "campaign contributions.\n", - "\n", - "With such a large set of input data, we cannot store all the comparisons\n", - "we need to make in memory. Instead, we will read the pairs on demand\n", - "from the Athena database.\n", - "\n", - "__Note:__ You will need to run `python mysql_init_db.py`\n", - "before running this script. See the annotates source for\n", - "[mysql_init_db.py](mysql_init_db.html)\n", - "\n", - "For smaller datasets (<10,000), see our\n", - "[csv_example](csv_example.html)\n", - "\"\"\"\n", - "\n", - "# There is a little bit difference between the result \n", - "# of this module and the mysql one. The reason is due to\n", - "# Some special (and mostly erroneous) characters, such as \\a .. 
\n", - "# Which are dealt with differently by mysql and athena/panda\n", - "\n", - "import sys\n", - "import os\n", - "import itertools\n", - "import time\n", - "import logging\n", - "import optparse\n", - "import locale\n", - "import json\n", - "from io import StringIO\n", - "import csv\n", - "import pandas as pd\n", - "\n", - "import boto3\n", - "import dedupe\n", - "import dedupe.backport\n", - "sys.path.insert(0, '../athena_example/')\n", - "import config\n", - "sys.path.insert(0, '../athena_example/')\n", - "import athenautils\n", - "\n", - "def cursor_execute(query, database):\n", - " '''\n", - " The MySQL compatible Cursor\n", - " '''\n", - " return athenautils.cursor_execute(query, database=database, \n", - " cursortype='tuple', buffersize=config.BUFFERSIZE,\n", - " escapechar=None, keep_default_na=False, na_values=[''])\n", - "\n", - "def dict_cursor_execute(query, database):\n", - " '''\n", - " The MySQL compatible DicCursor\n", - " '''\n", - " return athenautils.cursor_execute(query, database=database, \n", - " cursortype='dict', buffersize=config.BUFFERSIZE,\n", - " escapechar=None, keep_default_na=False, na_values=[''])\n", - "def record_pairs(result_set):\n", - " for i, row in enumerate(result_set):\n", - " a_record_id, a_record, b_record_id, b_record = row\n", - " record_a = (a_record_id, json.loads(a_record))\n", - " record_b = (b_record_id, json.loads(b_record))\n", - "\n", - " yield record_a, record_b\n", - "\n", - " if i % 10000 == 0:\n", - " print(i)\n", - "\n", - "\n", - "def cluster_ids(clustered_dupes):\n", - "\n", - " for cluster, scores in clustered_dupes:\n", - " cluster_id = cluster[0]\n", - " for donor_id, score in zip(cluster, scores):\n", - " yield donor_id, cluster_id, score\n", - "\n", - "\n", - "if __name__ == '__main__':\n", - "\n", - " ## Logging\n", - "\n", - " # Dedupe uses Python logging to show or suppress verbose output. Added\n", - " # for convenience. 
To enable verbose output, run `python\n", - " # examples/mysql_example/mysql_example.py -v`\n", - " \n", - " optp = optparse.OptionParser()\n", - " optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", - " help='Increase verbosity (specify multiple times for more)'\n", - " )\n", - " (opts, args) = optp.parse_args()\n", - " log_level = logging.WARNING\n", - " if opts.verbose:\n", - " if opts.verbose == 1:\n", - " log_level = logging.INFO\n", - " elif opts.verbose >= 2:\n", - " log_level = logging.DEBUG\n", - "\n", - "\n", - " logging.getLogger().setLevel(log_level)\n", - "\n", - " \n", - "\n", - "\n", - " settings_file = 'mysql_example_settings'\n", - " training_file = 'mysql_example_training.json'\n", - "\n", - " start_time = time.time()\n", - "\n", - " # We'll be using variations on this following select statement to pull\n", - " # in campaign donor info.\n", - " #\n", - " # We did a fair amount of preprocessing of the fields in\n", - " # `mysql_init_db.py` \n", - " DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n", - " from as_processed_donors\"\"\"\n", - "\n", - " # ## Training\n", - "\n", - " if os.path.exists(settings_file):\n", - " print('reading from ', settings_file)\n", - " with open(settings_file, 'rb') as sf:\n", - " deduper = dedupe.StaticDedupe(sf, num_cores=4)\n", - " else:\n", - " # Define the fields dedupe will pay attention to\n", - " #\n", - " # The address, city, and zip fields are often missing, so we'll\n", - " # tell dedupe that, and we'll learn a model that take that into\n", - " # account\n", - " fields = [{'field': 'name', 'type': 'String'},\n", - " {'field': 'address', 'type': 'String',\n", - " 'has missing': True},\n", - " {'field': 'city', 'type': 'ShortString', 'has missing': True},\n", - " {'field': 'state', 'type': 'ShortString', 'has missing': True},\n", - " {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n", - " ]\n", - "\n", - " # Create a new deduper object and pass our data 
model to it.\n", - " deduper = dedupe.Dedupe(fields, num_cores=4)\n", - "\n", - " # We will sample pairs from the entire donor table for training\n", - " cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", - " temp_d = {i: row for i, row in enumerate(cur)}\n", - " \n", - "\n", - " # If we have training data saved from a previous run of dedupe,\n", - " # look for it an load it in.\n", - " #\n", - " # __Note:__ if you want to train from\n", - " # scratch, delete the training_file\n", - " if os.path.exists(training_file):\n", - " print('reading labeled examples from ', training_file)\n", - " with open(training_file) as tf:\n", - " deduper.prepare_training(temp_d, training_file=tf)\n", - " else:\n", - " deduper.prepare_training(temp_d)\n", - "\n", - " del temp_d\n", - "\n", - " # ## Active learning\n", - "\n", - " print('starting active labeling...')\n", - " # Starts the training loop. Dedupe will find the next pair of records\n", - " # it is least certain about and ask you to label them as duplicates\n", - " # or not.\n", - "\n", - " # use 'y', 'n' and 'u' keys to flag duplicates\n", - " # press 'f' when you are finished\n", - " dedupe.convenience.console_label(deduper)\n", - " # When finished, save our labeled, training pairs to disk\n", - " with open(training_file, 'w') as tf:\n", - " deduper.write_training(tf)\n", - "\n", - " # Notice our the argument here\n", - " #\n", - " # `recall` is the proportion of true dupes pairs that the learned\n", - " # rules must cover. 
You may want to reduce this if your are making\n", - " # too many blocks and too many comparisons.\n", - " deduper.train(recall=0.90)\n", - "\n", - " with open(settings_file, 'wb') as sf:\n", - " deduper.write_settings(sf)\n", - "\n", - " # We can now remove some of the memory hobbing objects we used\n", - " # for training\n", - " deduper.cleanup_training()\n", - "\n", - " # ## Blocking\n", - "\n", - " print('blocking...')\n", - "\n", - " # To run blocking on such a large set of data, we create a separate table\n", - " # that contains blocking keys and record ids\n", - " print('creating as_blocking_map database')\n", - " athenautils.drop_external_table(\"as_blocking_map\", \n", - " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n", - " database=config.DATABASE)\n", - "\n", - " q=\"\"\"\n", - " CREATE EXTERNAL TABLE as_blocking_map \n", - " (block_key VARCHAR(200), donor_id INTEGER)\n", - " ROW FORMAT DELIMITED\n", - " FIELDS TERMINATED BY '\\t'\n", - " LINES TERMINATED BY '\\n' \n", - " LOCATION\n", - " 's3://{}/{}' \n", - " TBLPROPERTIES (\n", - " 'classification'='csv', \n", - " --'skip.header.line.count'='1', \n", - " 'serialization.null.format'='')\n", - " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n", - " athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - " # If dedupe learned a Index Predicate, we have to take a pass\n", - " # through the data and create indices.\n", - " print('creating inverted index')\n", - "\n", - " # Armin: \n", - " # This never runs, index_fields is empty, possible bug?\n", - " for field in deduper.fingerprinter.index_fields:\n", - " q = \"\"\"\n", - " SELECT DISTINCT {field} FROM as_processed_donors\n", - " WHERE {field} IS NOT NULL\n", - " \"\"\".format(field=field)\n", - " cur = dict_cursor_execute(q, databse=config.DATABASE)\n", - " field_data = (row[field] for row in cur)\n", - " deduper.fingerprinter.index(field_data, 
field)\n", - " \n", - "\n", - " # Now we are ready to write our blocking map table by creating a\n", - " # generator that yields unique `(block_key, donor_id)` tuples.\n", - " print('writing blocking map')\n", - " \n", - " read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", - " full_data = ((row['donor_id'], row) for row in read_cur)\n", - "\n", - " b_data = deduper.fingerprinter(full_data)\n", - " athenautils.write_many(b_data, \n", - " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n", - "\n", - "\n", - " # select unique pairs to compare\n", - " q=\"\"\"\n", - " SELECT a.donor_id,\n", - " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n", - " ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n", - " AS JSON)),\n", - " b.donor_id,\n", - " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n", - " ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n", - " AS JSON))\n", - " FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", - " from as_blocking_map as l\n", - " INNER JOIN as_blocking_map as r\n", - " using (block_key)\n", - " where l.donor_id < r.donor_id) ids\n", - " INNER JOIN as_processed_donors a on ids.east=a.donor_id\n", - " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", - " \"\"\"\n", - " read_cur = cursor_execute(q, database=config.DATABASE)\n", - "\n", - "\n", - " # ## Clustering\n", - "\n", - " print('clustering...')\n", - " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n", - " threshold=0.5)\n", - "\n", - "# athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n", - " athenautils.drop_external_table(\"as_entity_map\", \n", - " location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n", - " database=config.DATABASE)\n", - " \n", - " print('creating as_entity_map database')\n", - " 
q=\"\"\"\n", - " CREATE EXTERNAL TABLE as_entity_map \n", - " (donor_id INTEGER, canon_id INTEGER, \n", - " cluster_score FLOAT)\n", - " ROW FORMAT DELIMITED\n", - " FIELDS TERMINATED BY '\\t'\n", - " LINES TERMINATED BY '\\n' \n", - " LOCATION\n", - " 's3://{}/{}' \n", - " TBLPROPERTIES (\n", - " 'classification'='csv', \n", - " --'skip.header.line.count'='1', \n", - " 'serialization.null.format'='')\n", - " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n", - " athenautils.athena_start_query(q, database=config.DATABASE) \n", - "\n", - " athenautils.write_many(cluster_ids(clustered_dupes),\n", - " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n", - "\n", - " # Print out the number of duplicates found\n", - " print('# duplicate sets')\n", - "\n", - " # ## Payoff\n", - "\n", - " # With all this done, we can now begin to ask interesting questions\n", - " # of the data\n", - " #\n", - " # For example, let's see who the top 10 donors are.\n", - "\n", - " locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers\n", - " \n", - " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n", - " \n", - " q = \"\"\"\n", - " CREATE TABLE as_e_map as \n", - " SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n", - " FROM as_entity_map \n", - " RIGHT JOIN as_donors USING(donor_id) \n", - " \"\"\" \n", - " athenautils.athena_start_query(q, database=config.DATABASE)\n", - " \n", - " q = \"\"\"\n", - " SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", - " donation_totals.totals AS totals \n", - " FROM as_donors INNER JOIN \n", - " (SELECT canon_id, SUM(cast (amount as double)) AS totals \n", - " FROM as_contributions INNER JOIN as_e_map \n", - " USING (donor_id) \n", - " GROUP BY (canon_id) \n", - " ORDER BY totals \n", - " DESC 
LIMIT 10) \n", - " AS donation_totals \n", - " ON as_donors.donor_id = donation_totals.canon_id\n", - " ORDER BY totals DESC\n", - " \"\"\"\n", - " cur = dict_cursor_execute(q, database=config.DATABASE)\n", - "\n", - " print(\"Top Donors (deduped)\")\n", - " for row in cur:\n", - " row['totals'] = locale.currency(row['totals'], grouping=True)\n", - " print('%(totals)20s: %(name)s' % row)\n", - "\n", - " # Compare this to what we would have gotten if we hadn't done any\n", - " # deduplication\n", - " q = \"\"\"\n", - " with donorscontributions as(\n", - "\n", - " SELECT as_donors.donor_id, \n", - " array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", - " cast(as_contributions.amount as double) as amount\n", - " FROM as_donors INNER JOIN as_contributions \n", - " USING (donor_id) \n", - " )\n", - " SELECT name, sum(amount) AS totals \n", - " FROM donorscontributions\n", - " GROUP BY donor_id, name\n", - " ORDER BY totals DESC \n", - " LIMIT 10\n", - " \"\"\"\n", - " cur = dict_cursor_execute(q, database=config.DATABASE)\n", - "\n", - " print(\"Top Donors (raw)\")\n", - " for row in cur:\n", - " row['totals'] = locale.currency(row['totals'], grouping=True)\n", - " print('%(totals)20s: %(name)s' % row)\n", - "\n", - " print('ran in', time.time() - start_time, 'seconds')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "^C\r\n", - "Traceback (most recent call last):\r\n", - " File \"../athena_example/athena_example.py\", line 156, in \r\n", - " deduper.prepare_training(temp_d)\r\n", - " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1249, in prepare_training\r\n", - " self._sample(data, sample_size, blocked_proportion, original_length)\r\n", - " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1287, in _sample\r\n", 
- " index_include=examples)\r\n", - " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 418, in __init__\r\n", - " index_include)\r\n", - " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 246, in __init__\r\n", - " index_data)\r\n", - " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 128, in __init__\r\n", - " simple_cover = self.coveredPairs(self.blocker, sampled_records)\r\n", - " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 156, in coveredPairs\r\n", - " for block in pred_cover.values()\r\n", - " File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 157, in \r\n", - " for pair in itertools.combinations(sorted(block), 2)}\r\n", - "KeyboardInterrupt\r\n" - ] - } - ], - "source": [ - "!python ../athena_example/athena_example.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb deleted file mode 100644 index d35250de..00000000 --- a/notebooks/athena_init_db.ipynb +++ /dev/null @@ -1,277 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip 
install dedupe" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile ../athena_example/config.py\n", - "LOG_FILE = 'log.txt'\n", - "\n", - "# Connection parameters\n", - "ACCESS_KEY_ID = None\n", - "SECRET_ACCESS_KEY = None\n", - "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n", - "WORKGROUP = 'RDS'\n", - "REGION = 'eu-west-1'\n", - "DATABASE = 'ria_tmp'\n", - "\n", - "# Database Parameters\n", - "DATABASE_BUCKET = 'ria-temp'\n", - "DATABASE_ROOT_KEY = 'as_dedupe/'\n", - "BUFFERSIZE = 100000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile ../athena_example/athena_init.py\n", - "#!/usr/bin/python\n", - "\"\"\"\n", - "This is a setup script for athena_example. It downloads a zip file of\n", - "Illinois campaign contributions and loads them into a Athena database\n", - "named 'contributions'.\n", - " \n", - "__Note:__ You will need to run this script first before execuing\n", - "[athena_example.py](athena_example.py).\n", - " \n", - "Tables created:\n", - "* as_raw_table - raw import of entire CSV file\n", - "* donors - all distinct donors based on name and address\n", - "* recipients - all distinct campaign contribution recipients\n", - "* contributions - contribution amounts tied to donor and recipients tables\n", - "\"\"\"\n", - "\n", - "import os\n", - "import zipfile\n", - "import warnings\n", - "import pandas as pd\n", - "import numpy as np\n", - "from urllib.request import urlopen\n", - "import boto3\n", - "import config\n", - "import csv\n", - "import sys\n", - "sys.path.insert(0, '../athena_example/')\n", - "import athenautils\n", - "\n", - "\n", - "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", - "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n", - "\n", - "if not os.path.exists(contributions_zip_file) :\n", - " print('downloading', 
contributions_zip_file, '(~60mb) ...')\n", - " u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n", - " localFile = open(contributions_zip_file, 'wb')\n", - " localFile.write(u.read())\n", - " localFile.close()\n", - "\n", - "if not os.path.exists(contributions_txt_file) :\n", - " zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n", - " print('extracting %s' % contributions_zip_file)\n", - " zip_file_contents = zip_file.namelist()\n", - " for f in zip_file_contents:\n", - " if ('.txt' in f):\n", - " zip_file.extract(f)\n", - " zip_file.close()\n", - "\n", - "\n", - "\n", - "\n", - "print('importing raw data from csv...')\n", - "athenautils.drop_external_table(\"as_raw_table\", \n", - " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n", - " database=config.DATABASE) \n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n", - "\n", - "\n", - "q=r\"\"\"\n", - "CREATE EXTERNAL TABLE as_raw_table \n", - " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", - " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n", - " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n", - " date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n", - " amount VARCHAR(23), receipt_type VARCHAR(23), \n", - " employer VARCHAR(70), occupation VARCHAR(40), \n", - " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n", - " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n", - " vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n", - " vendor_zip VARCHAR(10), 
description VARCHAR(90), \n", - " election_type VARCHAR(10), election_year VARCHAR(10), \n", - " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n", - " committee_name VARCHAR(70), committee_id VARCHAR(37)) \n", - "ROW FORMAT DELIMITED\n", - " FIELDS TERMINATED BY '\\t'\n", - " ESCAPED BY '\\\\'\n", - " LINES TERMINATED BY '\\n' \n", - "LOCATION\n", - " 's3://{}/{}' \n", - "TBLPROPERTIES (\n", - " 'classification'='csv', \n", - " 'skip.header.line.count'='1', \n", - " 'serialization.null.format'='')\n", - "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "\n", - "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE, \n", - " error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],\n", - " chunksize=config.BUFFERSIZE)\n", - "chunkcount = 0\n", - "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n", - "for df in df_cursor: \n", - " # Remove the very few records that mess up the demo \n", - " # (demo purposes only! Don't do something like this in production)\n", - " df = df[df['RcvDate'].str.len()>=10]\n", - "\n", - " # set empty, non-zero, strings in date columns to null\n", - " df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n", - "\n", - " df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", - "\n", - " #committee ID is requred. Remove the 2 rows that don't have it.\n", - " df = df[df['ID']!='']\n", - "\n", - " # There's a record with a date stuck in the committee_id column, which causes\n", - " # problems when inserting into the contributions table below. 
Get rid of it this \n", - " # way.\n", - " df = df[df['ID'].str.len() <=9]\n", - "\n", - " # dropping the last columns\n", - " df = df.drop(columns='Unnamed: 29')\n", - "\n", - " df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", - " \n", - " buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n", - " \n", - " chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n", - " athenautils.write(body=buffer, filename=chunk_fname)\n", - " chunkcount += 1 \n", - " \n", - "print('creating donors table...')\n", - "q=\"\"\"\n", - "CREATE TABLE as_donors as\n", - " with tmp as\n", - " (SELECT DISTINCT \n", - " NULLIF(TRIM(last_name), '') as last_name, \n", - " NULLIF(TRIM(first_name), '') as first_name, \n", - " NULLIF(TRIM(address_1), '') as address_1, \n", - " NULLIF(TRIM(address_2), '') as address_2, \n", - " NULLIF(TRIM(city), '') city, \n", - " NULLIF(TRIM(state), '') as state, \n", - " NULLIF(TRIM(zip), '') as zip, \n", - " NULLIF(TRIM(employer), '') as employer, \n", - " NULLIF(TRIM(occupation), '') as occupation\n", - " FROM as_raw_table)\n", - " SELECT row_number() over () as donor_id, * from tmp\"\"\"\n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "\n", - "q=\"\"\"\n", - "CREATE TABLE as_recipients as\n", - " SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n", - "\"\"\"\n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "print('creating contributions table')\n", - "\n", - "q=\"\"\"\n", - "CREATE TABLE as_contributions as\n", - " SELECT reciept_id as contribution_id, \n", - " donors.donor_id as donor_id , \n", - " committee_id as recipient_id, \n", - " report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n", - " loan_amount, amount, \n", - " 
receipt_type, vendor_last_name , \n", - " vendor_first_name, vendor_address_1, vendor_address_2, \n", - " vendor_city, vendor_state, vendor_zip, description, \n", - " election_type, election_year, \n", - " date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n", - " date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n", - " FROM as_raw_table JOIN as_donors donors ON \n", - " coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND \n", - " coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n", - " coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n", - " coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n", - " coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n", - " coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n", - " coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n", - " coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n", - " coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n", - "\n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "q = \"\"\"\n", - "CREATE TABLE as_processed_donors AS \n", - " SELECT donor_id, \n", - " LOWER(city) AS city, \n", - " CASE WHEN (first_name IS NULL AND last_name IS NULL) \n", - " THEN NULL \n", - " ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n", - " END AS name, \n", - " LOWER(zip) AS zip, \n", - " LOWER(state) AS state, \n", - " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n", - " THEN NULL \n", - " ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n", - " END AS address, \n", - " LOWER(occupation) AS occupation, \n", - " LOWER(employer) AS employer, \n", - " first_name is null AS person \n", - " FROM as_donors\"\"\"\n", - 
"athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "\n", - "\n", - "\n", - "print('done')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python ../athena_example/athena_init.py" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 3e1bb9657134c12bee1e41e056c87c2d9c27993d Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 5 Nov 2020 14:18:56 +0000 Subject: [PATCH 13/19] solving memeory issue --- notebooks/athena_example.ipynb | 418 +++++++++++++++++++++++++++++++++ notebooks/athena_init_db.ipynb | 277 ++++++++++++++++++++++ 2 files changed, 695 insertions(+) create mode 100644 notebooks/athena_example.ipynb create mode 100644 notebooks/athena_init_db.ipynb diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb new file mode 100644 index 00000000..089fb9e0 --- /dev/null +++ b/notebooks/athena_example.ipynb @@ -0,0 +1,418 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile ../athena_example/athena_example.py\n", + "\n", + "\"\"\"\n", + "This is an example of working with very large data. There are about\n", + "700,000 unduplicated donors in this database of Illinois political\n", + "campaign contributions.\n", + "\n", + "With such a large set of input data, we cannot store all the comparisons\n", + "we need to make in memory. 
Instead, we will read the pairs on demand\n", + "from the Athena database.\n", + "\n", + "__Note:__ You will need to run `python athena_init_db.py`\n", + "before running this script. See the annotates source for\n", + "[athena_init_db.py](athena_init_db.html)\n", + "\n", + "For smaller datasets (<10,000), see our\n", + "[csv_example](csv_example.html)\n", + "\"\"\"\n", + "\n", + "# There is a little bit difference between the result \n", + "# of this module and the athena one. The reason is due to\n", + "# Some special (and mostly erroneous) characters, such as \\a .. \n", + "# Which are dealt with differently by athena and athena/panda\n", + "\n", + "import sys\n", + "import os\n", + "import itertools\n", + "import time\n", + "import logging\n", + "import optparse\n", + "import locale\n", + "import json\n", + "from io import StringIO\n", + "import csv\n", + "import pandas as pd\n", + "\n", + "import boto3\n", + "import dedupe\n", + "import dedupe.backport\n", + "sys.path.insert(0, '../athena_example/')\n", + "import config\n", + "sys.path.insert(0, '../athena_example/')\n", + "import athenautils\n", + "\n", + "def cursor_execute(query, database):\n", + " '''\n", + " The MySQL compatible Cursor\n", + " '''\n", + " return athenautils.cursor_execute(query, database=database, \n", + " cursortype='tuple', buffersize=config.BUFFERSIZE,\n", + " escapechar=None, keep_default_na=False, na_values=[''])\n", + "\n", + "def dict_cursor_execute(query, database):\n", + " '''\n", + " The MySQL compatible DicCursor\n", + " '''\n", + " return athenautils.cursor_execute(query, database=database, \n", + " cursortype='dict', buffersize=config.BUFFERSIZE,\n", + " escapechar=None, keep_default_na=False, na_values=[''])\n", + "def record_pairs(result_set):\n", + " for i, row in enumerate(result_set):\n", + " a_record_id, a_record, b_record_id, b_record = row\n", + " record_a = (a_record_id, json.loads(a_record))\n", + " record_b = (b_record_id, json.loads(b_record))\n", + "\n", + " yield 
record_a, record_b\n", + "\n", + " if i % 10000 == 0:\n", + " print(i)\n", + "\n", + "\n", + "def cluster_ids(clustered_dupes):\n", + "\n", + " for cluster, scores in clustered_dupes:\n", + " cluster_id = cluster[0]\n", + " for donor_id, score in zip(cluster, scores):\n", + " yield donor_id, cluster_id, score\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + "\n", + " ## Logging\n", + "\n", + " # Dedupe uses Python logging to show or suppress verbose output. Added\n", + " # for convenience. To enable verbose output, run `python\n", + " # examples/athena_example/athena_example.py -v`\n", + " \n", + " optp = optparse.OptionParser()\n", + " optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", + " help='Increase verbosity (specify multiple times for more)'\n", + " )\n", + " (opts, args) = optp.parse_args()\n", + " log_level = logging.WARNING\n", + " if opts.verbose:\n", + " if opts.verbose == 1:\n", + " log_level = logging.INFO\n", + " elif opts.verbose >= 2:\n", + " log_level = logging.DEBUG\n", + "\n", + "\n", + " logging.getLogger().setLevel(log_level)\n", + "\n", + " \n", + "\n", + "\n", + " settings_file = 'athena_example_settings'\n", + " training_file = 'athena_example_training.json'\n", + "\n", + " start_time = time.time()\n", + "\n", + " # We'll be using variations on this following select statement to pull\n", + " # in campaign donor info.\n", + " #\n", + " # We did a fair amount of preprocessing of the fields in\n", + " # `athena_init_db.py` \n", + " DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n", + " from as_processed_donors\"\"\"\n", + "\n", + " # ## Training\n", + "\n", + " if os.path.exists(settings_file):\n", + " print('reading from ', settings_file)\n", + " with open(settings_file, 'rb') as sf:\n", + " deduper = dedupe.StaticDedupe(sf, num_cores=4)\n", + " else:\n", + " # Define the fields dedupe will pay attention to\n", + " #\n", + " # The address, city, and zip fields are often missing, so we'll\n", 
+ " # tell dedupe that, and we'll learn a model that take that into\n", + " # account\n", + " fields = [{'field': 'name', 'type': 'String'},\n", + " {'field': 'address', 'type': 'String',\n", + " 'has missing': True},\n", + " {'field': 'city', 'type': 'ShortString', 'has missing': True},\n", + " {'field': 'state', 'type': 'ShortString', 'has missing': True},\n", + " {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n", + " ]\n", + "\n", + " # Create a new deduper object and pass our data model to it.\n", + " deduper = dedupe.Dedupe(fields, num_cores=4)\n", + "\n", + " # We will sample pairs from the entire donor table for training\n", + " cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", + " temp_d = {i: row for i, row in enumerate(cur)}\n", + " \n", + "\n", + " # If we have training data saved from a previous run of dedupe,\n", + " # look for it an load it in.\n", + " #\n", + " # __Note:__ if you want to train from\n", + " # scratch, delete the training_file\n", + " if os.path.exists(training_file):\n", + " print('reading labeled examples from ', training_file)\n", + " with open(training_file) as tf:\n", + " deduper.prepare_training(temp_d, training_file=tf)\n", + " else:\n", + " deduper.prepare_training(temp_d)\n", + "\n", + " del temp_d\n", + "\n", + " # ## Active learning\n", + "\n", + " print('starting active labeling...')\n", + " # Starts the training loop. 
Dedupe will find the next pair of records\n", + " # it is least certain about and ask you to label them as duplicates\n", + " # or not.\n", + "\n", + " # use 'y', 'n' and 'u' keys to flag duplicates\n", + " # press 'f' when you are finished\n", + " dedupe.convenience.console_label(deduper)\n", + " # When finished, save our labeled, training pairs to disk\n", + " with open(training_file, 'w') as tf:\n", + " deduper.write_training(tf)\n", + "\n", + " # Notice our the argument here\n", + " #\n", + " # `recall` is the proportion of true dupes pairs that the learned\n", + " # rules must cover. You may want to reduce this if your are making\n", + " # too many blocks and too many comparisons.\n", + " deduper.train(recall=0.90)\n", + "\n", + " with open(settings_file, 'wb') as sf:\n", + " deduper.write_settings(sf)\n", + "\n", + " # We can now remove some of the memory hobbing objects we used\n", + " # for training\n", + " deduper.cleanup_training()\n", + "\n", + " # ## Blocking\n", + "\n", + " print('blocking...')\n", + "\n", + " # To run blocking on such a large set of data, we create a separate table\n", + " # that contains blocking keys and record ids\n", + " print('creating as_blocking_map database')\n", + " athenautils.drop_external_table(\"as_blocking_map\", \n", + " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n", + " database=config.DATABASE)\n", + "\n", + " q=\"\"\"\n", + " CREATE EXTERNAL TABLE as_blocking_map \n", + " (block_key VARCHAR(200), donor_id INTEGER)\n", + " ROW FORMAT DELIMITED\n", + " FIELDS TERMINATED BY '\\t'\n", + " LINES TERMINATED BY '\\n' \n", + " LOCATION\n", + " 's3://{}/{}' \n", + " TBLPROPERTIES (\n", + " 'classification'='csv', \n", + " --'skip.header.line.count'='1', \n", + " 'serialization.null.format'='')\n", + " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n", + " athenautils.athena_start_query(q, database=config.DATABASE)\n", + "\n", + " # If 
dedupe learned a Index Predicate, we have to take a pass\n", + " # through the data and create indices.\n", + " print('creating inverted index')\n", + "\n", + " # Armin: \n", + " # This never runs, index_fields is empty, possible bug?\n", + " for field in deduper.fingerprinter.index_fields:\n", + " q = \"\"\"\n", + " SELECT DISTINCT {field} FROM as_processed_donors\n", + " WHERE {field} IS NOT NULL\n", + " \"\"\".format(field=field)\n", + " cur = dict_cursor_execute(q, databse=config.DATABASE)\n", + " field_data = (row[field] for row in cur)\n", + " deduper.fingerprinter.index(field_data, field)\n", + " \n", + "\n", + " # Now we are ready to write our blocking map table by creating a\n", + " # generator that yields unique `(block_key, donor_id)` tuples.\n", + " print('writing blocking map')\n", + " \n", + " read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", + " full_data = ((row['donor_id'], row) for row in read_cur)\n", + "\n", + " b_data = deduper.fingerprinter(full_data)\n", + " athenautils.write_many(b_data, \n", + " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n", + "\n", + "\n", + " # select unique pairs to compare\n", + " q=\"\"\"\n", + " SELECT a.donor_id,\n", + " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n", + " ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n", + " AS JSON)),\n", + " b.donor_id,\n", + " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n", + " ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n", + " AS JSON))\n", + " FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", + " from as_blocking_map as l\n", + " INNER JOIN as_blocking_map as r\n", + " using (block_key)\n", + " where l.donor_id < r.donor_id) ids\n", + " INNER JOIN as_processed_donors a on ids.east=a.donor_id\n", + " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", + " \"\"\"\n", + " read_cur = 
cursor_execute(q, database=config.DATABASE)\n", + "\n", + "\n", + " # ## Clustering\n", + "\n", + " print('clustering...')\n", + " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n", + " threshold=0.5)\n", + "\n", + "# athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n", + " athenautils.drop_external_table(\"as_entity_map\", \n", + " location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n", + " database=config.DATABASE)\n", + " \n", + " print('creating as_entity_map database')\n", + " q=\"\"\"\n", + " CREATE EXTERNAL TABLE as_entity_map \n", + " (donor_id INTEGER, canon_id INTEGER, \n", + " cluster_score FLOAT)\n", + " ROW FORMAT DELIMITED\n", + " FIELDS TERMINATED BY '\\t'\n", + " LINES TERMINATED BY '\\n' \n", + " LOCATION\n", + " 's3://{}/{}' \n", + " TBLPROPERTIES (\n", + " 'classification'='csv', \n", + " --'skip.header.line.count'='1', \n", + " 'serialization.null.format'='')\n", + " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n", + " athenautils.athena_start_query(q, database=config.DATABASE) \n", + "\n", + " athenautils.write_many(cluster_ids(clustered_dupes),\n", + " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n", + "\n", + " # Print out the number of duplicates found\n", + " print('# duplicate sets')\n", + "\n", + " # ## Payoff\n", + "\n", + " # With all this done, we can now begin to ask interesting questions\n", + " # of the data\n", + " #\n", + " # For example, let's see who the top 10 donors are.\n", + "\n", + " locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers\n", + " \n", + " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n", + " \n", + " q = \"\"\"\n", + " CREATE TABLE as_e_map as \n", + " SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, 
as_entity_map.donor_id \n", + " FROM as_entity_map \n", + " RIGHT JOIN as_donors USING(donor_id) \n", + " \"\"\" \n", + " athenautils.athena_start_query(q, database=config.DATABASE)\n", + " \n", + " q = \"\"\"\n", + " SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", + " donation_totals.totals AS totals \n", + " FROM as_donors INNER JOIN \n", + " (SELECT canon_id, SUM(cast (amount as double)) AS totals \n", + " FROM as_contributions INNER JOIN as_e_map \n", + " USING (donor_id) \n", + " GROUP BY (canon_id) \n", + " ORDER BY totals \n", + " DESC LIMIT 10) \n", + " AS donation_totals \n", + " ON as_donors.donor_id = donation_totals.canon_id\n", + " ORDER BY totals DESC\n", + " \"\"\"\n", + " cur = dict_cursor_execute(q, database=config.DATABASE)\n", + "\n", + " print(\"Top Donors (deduped)\")\n", + " for row in cur:\n", + " row['totals'] = locale.currency(row['totals'], grouping=True)\n", + " print('%(totals)20s: %(name)s' % row)\n", + "\n", + " # Compare this to what we would have gotten if we hadn't done any\n", + " # deduplication\n", + " q = \"\"\"\n", + " with donorscontributions as(\n", + "\n", + " SELECT as_donors.donor_id, \n", + " array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", + " cast(as_contributions.amount as double) as amount\n", + " FROM as_donors INNER JOIN as_contributions \n", + " USING (donor_id) \n", + " )\n", + " SELECT name, sum(amount) AS totals \n", + " FROM donorscontributions\n", + " GROUP BY donor_id, name\n", + " ORDER BY totals DESC \n", + " LIMIT 10\n", + " \"\"\"\n", + " cur = dict_cursor_execute(q, database=config.DATABASE)\n", + "\n", + " print(\"Top Donors (raw)\")\n", + " for row in cur:\n", + " row['totals'] = locale.currency(row['totals'], grouping=True)\n", + " print('%(totals)20s: %(name)s' % row)\n", + "\n", + " print('ran in', time.time() - start_time, 'seconds')\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python ../athena_example/athena_example.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb new file mode 100644 index 00000000..d35250de --- /dev/null +++ b/notebooks/athena_init_db.ipynb @@ -0,0 +1,277 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install dedupe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile ../athena_example/config.py\n", + "LOG_FILE = 'log.txt'\n", + "\n", + "# Connection parameters\n", + "ACCESS_KEY_ID = None\n", + "SECRET_ACCESS_KEY = None\n", + "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n", + "WORKGROUP = 'RDS'\n", + "REGION = 'eu-west-1'\n", + "DATABASE = 'ria_tmp'\n", + "\n", + "# Database Parameters\n", + "DATABASE_BUCKET = 'ria-temp'\n", + "DATABASE_ROOT_KEY = 'as_dedupe/'\n", + "BUFFERSIZE = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile ../athena_example/athena_init.py\n", + "#!/usr/bin/python\n", + "\"\"\"\n", + "This is a setup script for athena_example. 
It downloads a zip file of\n", + "Illinois campaign contributions and loads them into a Athena database\n", + "named 'contributions'.\n", + " \n", + "__Note:__ You will need to run this script first before execuing\n", + "[athena_example.py](athena_example.py).\n", + " \n", + "Tables created:\n", + "* as_raw_table - raw import of entire CSV file\n", + "* donors - all distinct donors based on name and address\n", + "* recipients - all distinct campaign contribution recipients\n", + "* contributions - contribution amounts tied to donor and recipients tables\n", + "\"\"\"\n", + "\n", + "import os\n", + "import zipfile\n", + "import warnings\n", + "import pandas as pd\n", + "import numpy as np\n", + "from urllib.request import urlopen\n", + "import boto3\n", + "import config\n", + "import csv\n", + "import sys\n", + "sys.path.insert(0, '../athena_example/')\n", + "import athenautils\n", + "\n", + "\n", + "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", + "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n", + "\n", + "if not os.path.exists(contributions_zip_file) :\n", + " print('downloading', contributions_zip_file, '(~60mb) ...')\n", + " u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n", + " localFile = open(contributions_zip_file, 'wb')\n", + " localFile.write(u.read())\n", + " localFile.close()\n", + "\n", + "if not os.path.exists(contributions_txt_file) :\n", + " zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n", + " print('extracting %s' % contributions_zip_file)\n", + " zip_file_contents = zip_file.namelist()\n", + " for f in zip_file_contents:\n", + " if ('.txt' in f):\n", + " zip_file.extract(f)\n", + " zip_file.close()\n", + "\n", + "\n", + "\n", + "\n", + "print('importing raw data from csv...')\n", + "athenautils.drop_external_table(\"as_raw_table\", \n", + " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n", + " 
database=config.DATABASE) \n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n", + "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n", + "\n", + "\n", + "q=r\"\"\"\n", + "CREATE EXTERNAL TABLE as_raw_table \n", + " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", + " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n", + " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n", + " date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n", + " amount VARCHAR(23), receipt_type VARCHAR(23), \n", + " employer VARCHAR(70), occupation VARCHAR(40), \n", + " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n", + " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n", + " vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n", + " vendor_zip VARCHAR(10), description VARCHAR(90), \n", + " election_type VARCHAR(10), election_year VARCHAR(10), \n", + " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n", + " committee_name VARCHAR(70), committee_id VARCHAR(37)) \n", + "ROW FORMAT DELIMITED\n", + " FIELDS TERMINATED BY '\\t'\n", + " ESCAPED BY '\\\\'\n", + " LINES TERMINATED BY '\\n' \n", + "LOCATION\n", + " 's3://{}/{}' \n", + "TBLPROPERTIES (\n", + " 'classification'='csv', \n", + " 'skip.header.line.count'='1', \n", + " 'serialization.null.format'='')\n", + "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", + "\n", + "\n", + "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE, \n", + " error_bad_lines=False, warn_bad_lines=True, 
dtype=str, keep_default_na=False, na_values=[''],\n", + " chunksize=config.BUFFERSIZE)\n", + "chunkcount = 0\n", + "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n", + "for df in df_cursor: \n", + " # Remove the very few records that mess up the demo \n", + " # (demo purposes only! Don't do something like this in production)\n", + " df = df[df['RcvDate'].str.len()>=10]\n", + "\n", + " # set empty, non-zero, strings in date columns to null\n", + " df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n", + "\n", + " df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", + "\n", + " #committee ID is requred. Remove the 2 rows that don't have it.\n", + " df = df[df['ID']!='']\n", + "\n", + " # There's a record with a date stuck in the committee_id column, which causes\n", + " # problems when inserting into the contributions table below. Get rid of it this \n", + " # way.\n", + " df = df[df['ID'].str.len() <=9]\n", + "\n", + " # dropping the last columns\n", + " df = df.drop(columns='Unnamed: 29')\n", + "\n", + " df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", + " \n", + " buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n", + " \n", + " chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n", + " athenautils.write(body=buffer, filename=chunk_fname)\n", + " chunkcount += 1 \n", + " \n", + "print('creating donors table...')\n", + "q=\"\"\"\n", + "CREATE TABLE as_donors as\n", + " with tmp as\n", + " (SELECT DISTINCT \n", + " NULLIF(TRIM(last_name), '') as last_name, \n", + " NULLIF(TRIM(first_name), '') as first_name, \n", + " NULLIF(TRIM(address_1), '') as address_1, \n", + " NULLIF(TRIM(address_2), '') as address_2, \n", + " 
NULLIF(TRIM(city), '') city, \n", + " NULLIF(TRIM(state), '') as state, \n", + " NULLIF(TRIM(zip), '') as zip, \n", + " NULLIF(TRIM(employer), '') as employer, \n", + " NULLIF(TRIM(occupation), '') as occupation\n", + " FROM as_raw_table)\n", + " SELECT row_number() over () as donor_id, * from tmp\"\"\"\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", + "\n", + "\n", + "q=\"\"\"\n", + "CREATE TABLE as_recipients as\n", + " SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n", + "\"\"\"\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", + "\n", + "print('creating contributions table')\n", + "\n", + "q=\"\"\"\n", + "CREATE TABLE as_contributions as\n", + " SELECT reciept_id as contribution_id, \n", + " donors.donor_id as donor_id , \n", + " committee_id as recipient_id, \n", + " report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n", + " loan_amount, amount, \n", + " receipt_type, vendor_last_name , \n", + " vendor_first_name, vendor_address_1, vendor_address_2, \n", + " vendor_city, vendor_state, vendor_zip, description, \n", + " election_type, election_year, \n", + " date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n", + " date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n", + " FROM as_raw_table JOIN as_donors donors ON \n", + " coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND \n", + " coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n", + " coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n", + " coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n", + " coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n", + " coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n", + " coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n", + " 
coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n", + " coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n", + "\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", + "\n", + "q = \"\"\"\n", + "CREATE TABLE as_processed_donors AS \n", + " SELECT donor_id, \n", + " LOWER(city) AS city, \n", + " CASE WHEN (first_name IS NULL AND last_name IS NULL) \n", + " THEN NULL \n", + " ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n", + " END AS name, \n", + " LOWER(zip) AS zip, \n", + " LOWER(state) AS state, \n", + " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n", + " THEN NULL \n", + " ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n", + " END AS address, \n", + " LOWER(occupation) AS occupation, \n", + " LOWER(employer) AS employer, \n", + " first_name is null AS person \n", + " FROM as_donors\"\"\"\n", + "athenautils.athena_start_query(q, database=config.DATABASE)\n", + "\n", + "\n", + "\n", + "\n", + "print('done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python ../athena_example/athena_init.py" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From b7eedfcbbe591641d043444ceb4c737281d192ac Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 11 Feb 2021 23:10:35 +0000 Subject: [PATCH 14/19] modifying pandas_read_csv --- athena_example/utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/athena_example/utils.py 
b/athena_example/utils.py index 1b8b935a..80922548 100644 --- a/athena_example/utils.py +++ b/athena_example/utils.py @@ -89,18 +89,17 @@ def list_all(path): def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs): - bucket, key = seperate_bucket_key(filepath_or_buffer) - obj = s3.get_object(Bucket=bucket, Key=key) - return pd.read_csv(SomethingIO(obj['Body'].read()), **kwargs) + return pd.read_csv(filepath_or_buffer, **kwargs) -def read(filename, verbose=True): +def reader(filename, Range='string', verbose=True): + ''' + Range: look at: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 + ''' log ("Reading {}".format(filename), verbose=verbose) if is_s3_url(filename): bucket, key = seperate_bucket_key(filename) - obj=s3.get_object(Bucket=bucket, Key=key) - return obj['Body'].read() - with open (filename) as f: - return f.read() + obj=s3.get_object(Bucket=bucket, Key=key, Range=Range) + return obj['Body'] def write(body, filename): bucket, key = seperate_bucket_key(filename) From e30e10764b07ea80085fce4310df7f86354060bc Mon Sep 17 00:00:00 2001 From: asajadi Date: Sat, 8 May 2021 16:53:53 -0400 Subject: [PATCH 15/19] updating config file, removing notebooks --- README.md | 68 ++++-- athena_example/README.md | 5 +- athena_example/config.py | 12 +- notebooks/athena_example.ipynb | 418 --------------------------------- notebooks/athena_init_db.ipynb | 277 ---------------------- 5 files changed, 61 insertions(+), 719 deletions(-) delete mode 100644 notebooks/athena_example.ipynb delete mode 100644 notebooks/athena_init_db.ipynb diff --git a/README.md b/README.md index 82abad3d..073b0ede 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # Dedupe Examples -Adding Athena Example scripts for the [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data. 
+Example scripts for the [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data. Part of the [Dedupe.io](https://dedupe.io/) cloud service and open source toolset for de-duplicating and finding fuzzy matches in your data. For more details, see the [differences between Dedupe.io and the dedupe library](https://dedupe.io/documentation/should-i-use-dedupeio-or-the-dedupe-python-library.html). -To get the athena examples: +To get these examples: ```bash -git clone https://github.com/asajadi/dedupe-examples.git +git clone https://github.com/dedupeio/dedupe-examples.git cd dedupe-examples ``` @@ -34,28 +34,66 @@ Afterwards, whenever you want to work on dedupe-examples, workon dedupe-examples ``` +### [CSV example](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html) - early childhood locations -### [athena example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions +This example works with a list of early childhood education sites in Chicago from 10 different sources. -Takes a database of IL campaign contribution data, loads it in to a -Athena database, and identifies the unique donors. 
+```bash +cd csv_example +pip install unidecode +python csv_example.py +``` + (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) + +**To see how you might use dedupe with smallish data, see the [annotated source code for csv_example.py](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html).** -To follow this example you need to +### [Patent example](https://dedupeio.github.io/dedupe-examples/docs/patent_example.html) - patent holders -* Create a Athena database called 'contributions' -* Update `athena_example/config.py` with your Athena credentials -* Install dependencies, `pip install -r requirements.txt` +This example works with Dutch inventors from the PATSTAT international patent data file + +```bash +cd patent_example +pip install unidecode +python patent_example.py +``` + (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) -Once that's all done you can run the example: +### [Record Linkage example](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html) - electronics products +This example links two spreadsheets of electronics products and links up the matching entries. Each dataset individually has no duplicates. 
```bash -cd mysql_example -python athena_init_db.py -python athena_example.py +cd record_linkage_example +python record_linkage_example.py ``` - (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) +**To see how you might use dedupe for linking datasets, see the [annotated source code for record_linkage_example.py](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html).** + +### [Gazetteer example](https://dedupeio.github.io/dedupe-examples/docs/gazetteer_example.html) - electronics products +This example links two spreadsheets of electronics products and links up the matching entries using the Gazetteer class + +```bash +cd gazetteer_example.py +python gazetteer_example.py +``` + + +### [MySQL example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions + +See `mysql_example/README.md` for details + +**To see how you might use dedupe with bigish data, see the [annotated source code for mysql_example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html).** + + +### [PostgreSQL big dedupe example](https://dedupeio.github.io/dedupe-examples/docs/pgsql_big_dedupe_example.html) - PostgreSQL example on large dataset + +See `pgsql_big_dedupe_example/README.md` for details + +This is the same example as the MySQL IL campaign contributions dataset above, but ported to run on PostgreSQL. + +### Athena example - IL campaign contributions +See `athena_example/README.md` for details +This is the same example as the MySQL IL campaign contributions dataset above, but ported to run on Athena. diff --git a/athena_example/README.md b/athena_example/README.md index 53442a12..7b481322 100644 --- a/athena_example/README.md +++ b/athena_example/README.md @@ -5,14 +5,13 @@ Athena database, and identifies the unique donors. 
To follow this example you need to -* Create a Athena database called 'contributions' -* Update `athena_example/config.py` with your Athena credentials +* Update `athena_example/config.py` with your Athena credentials, database name and the path to store the data * Install dependencies, `pip install -r requirements.txt` Once that's all done you can run the example: ```bash -cd mysql_example +cd athena_example python athena_init.py python athena_example.py ``` diff --git a/athena_example/config.py b/athena_example/config.py index 9808a709..b37da68c 100644 --- a/athena_example/config.py +++ b/athena_example/config.py @@ -3,12 +3,12 @@ # Connection parameters ACCESS_KEY_ID = None SECRET_ACCESS_KEY = None -ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds' -WORKGROUP = 'RDS' -REGION = 'eu-west-1' -DATABASE = 'ria_tmp' +ATHENA_GARBAGE_PATH = '' +WORKGROUP = '' +REGION = '' +DATABASE = '' # Database Parameters -DATABASE_BUCKET = 'ria-temp' -DATABASE_ROOT_KEY = 'as_dedupe/' +DATABASE_BUCKET = '' +DATABASE_ROOT_KEY = 'dedupe/' BUFFERSIZE = 100000 diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb deleted file mode 100644 index 089fb9e0..00000000 --- a/notebooks/athena_example.ipynb +++ /dev/null @@ -1,418 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile ../athena_example/athena_example.py\n", - "\n", - "\"\"\"\n", - "This is an example of working with very large data. There are about\n", - "700,000 unduplicated donors in this database of Illinois political\n", - "campaign contributions.\n", - "\n", - "With such a large set of input data, we cannot store all the comparisons\n", - "we need to make in memory. Instead, we will read the pairs on demand\n", - "from the Athena database.\n", - "\n", - "__Note:__ You will need to run `python athena_init_db.py`\n", - "before running this script. 
See the annotates source for\n", - "[athena_init_db.py](athena_init_db.html)\n", - "\n", - "For smaller datasets (<10,000), see our\n", - "[csv_example](csv_example.html)\n", - "\"\"\"\n", - "\n", - "# There is a little bit difference between the result \n", - "# of this module and the athena one. The reason is due to\n", - "# Some special (and mostly erroneous) characters, such as \\a .. \n", - "# Which are dealt with differently by athena and athena/panda\n", - "\n", - "import sys\n", - "import os\n", - "import itertools\n", - "import time\n", - "import logging\n", - "import optparse\n", - "import locale\n", - "import json\n", - "from io import StringIO\n", - "import csv\n", - "import pandas as pd\n", - "\n", - "import boto3\n", - "import dedupe\n", - "import dedupe.backport\n", - "sys.path.insert(0, '../athena_example/')\n", - "import config\n", - "sys.path.insert(0, '../athena_example/')\n", - "import athenautils\n", - "\n", - "def cursor_execute(query, database):\n", - " '''\n", - " The MySQL compatible Cursor\n", - " '''\n", - " return athenautils.cursor_execute(query, database=database, \n", - " cursortype='tuple', buffersize=config.BUFFERSIZE,\n", - " escapechar=None, keep_default_na=False, na_values=[''])\n", - "\n", - "def dict_cursor_execute(query, database):\n", - " '''\n", - " The MySQL compatible DicCursor\n", - " '''\n", - " return athenautils.cursor_execute(query, database=database, \n", - " cursortype='dict', buffersize=config.BUFFERSIZE,\n", - " escapechar=None, keep_default_na=False, na_values=[''])\n", - "def record_pairs(result_set):\n", - " for i, row in enumerate(result_set):\n", - " a_record_id, a_record, b_record_id, b_record = row\n", - " record_a = (a_record_id, json.loads(a_record))\n", - " record_b = (b_record_id, json.loads(b_record))\n", - "\n", - " yield record_a, record_b\n", - "\n", - " if i % 10000 == 0:\n", - " print(i)\n", - "\n", - "\n", - "def cluster_ids(clustered_dupes):\n", - "\n", - " for cluster, scores in 
clustered_dupes:\n", - " cluster_id = cluster[0]\n", - " for donor_id, score in zip(cluster, scores):\n", - " yield donor_id, cluster_id, score\n", - "\n", - "\n", - "if __name__ == '__main__':\n", - "\n", - " ## Logging\n", - "\n", - " # Dedupe uses Python logging to show or suppress verbose output. Added\n", - " # for convenience. To enable verbose output, run `python\n", - " # examples/athena_example/athena_example.py -v`\n", - " \n", - " optp = optparse.OptionParser()\n", - " optp.add_option('-v', '--verbose', dest='verbose', action='count',\n", - " help='Increase verbosity (specify multiple times for more)'\n", - " )\n", - " (opts, args) = optp.parse_args()\n", - " log_level = logging.WARNING\n", - " if opts.verbose:\n", - " if opts.verbose == 1:\n", - " log_level = logging.INFO\n", - " elif opts.verbose >= 2:\n", - " log_level = logging.DEBUG\n", - "\n", - "\n", - " logging.getLogger().setLevel(log_level)\n", - "\n", - " \n", - "\n", - "\n", - " settings_file = 'athena_example_settings'\n", - " training_file = 'athena_example_training.json'\n", - "\n", - " start_time = time.time()\n", - "\n", - " # We'll be using variations on this following select statement to pull\n", - " # in campaign donor info.\n", - " #\n", - " # We did a fair amount of preprocessing of the fields in\n", - " # `athena_init_db.py` \n", - " DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n", - " from as_processed_donors\"\"\"\n", - "\n", - " # ## Training\n", - "\n", - " if os.path.exists(settings_file):\n", - " print('reading from ', settings_file)\n", - " with open(settings_file, 'rb') as sf:\n", - " deduper = dedupe.StaticDedupe(sf, num_cores=4)\n", - " else:\n", - " # Define the fields dedupe will pay attention to\n", - " #\n", - " # The address, city, and zip fields are often missing, so we'll\n", - " # tell dedupe that, and we'll learn a model that take that into\n", - " # account\n", - " fields = [{'field': 'name', 'type': 'String'},\n", - " {'field': 
'address', 'type': 'String',\n", - " 'has missing': True},\n", - " {'field': 'city', 'type': 'ShortString', 'has missing': True},\n", - " {'field': 'state', 'type': 'ShortString', 'has missing': True},\n", - " {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n", - " ]\n", - "\n", - " # Create a new deduper object and pass our data model to it.\n", - " deduper = dedupe.Dedupe(fields, num_cores=4)\n", - "\n", - " # We will sample pairs from the entire donor table for training\n", - " cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", - " temp_d = {i: row for i, row in enumerate(cur)}\n", - " \n", - "\n", - " # If we have training data saved from a previous run of dedupe,\n", - " # look for it an load it in.\n", - " #\n", - " # __Note:__ if you want to train from\n", - " # scratch, delete the training_file\n", - " if os.path.exists(training_file):\n", - " print('reading labeled examples from ', training_file)\n", - " with open(training_file) as tf:\n", - " deduper.prepare_training(temp_d, training_file=tf)\n", - " else:\n", - " deduper.prepare_training(temp_d)\n", - "\n", - " del temp_d\n", - "\n", - " # ## Active learning\n", - "\n", - " print('starting active labeling...')\n", - " # Starts the training loop. Dedupe will find the next pair of records\n", - " # it is least certain about and ask you to label them as duplicates\n", - " # or not.\n", - "\n", - " # use 'y', 'n' and 'u' keys to flag duplicates\n", - " # press 'f' when you are finished\n", - " dedupe.convenience.console_label(deduper)\n", - " # When finished, save our labeled, training pairs to disk\n", - " with open(training_file, 'w') as tf:\n", - " deduper.write_training(tf)\n", - "\n", - " # Notice our the argument here\n", - " #\n", - " # `recall` is the proportion of true dupes pairs that the learned\n", - " # rules must cover. 
You may want to reduce this if your are making\n", - " # too many blocks and too many comparisons.\n", - " deduper.train(recall=0.90)\n", - "\n", - " with open(settings_file, 'wb') as sf:\n", - " deduper.write_settings(sf)\n", - "\n", - " # We can now remove some of the memory hobbing objects we used\n", - " # for training\n", - " deduper.cleanup_training()\n", - "\n", - " # ## Blocking\n", - "\n", - " print('blocking...')\n", - "\n", - " # To run blocking on such a large set of data, we create a separate table\n", - " # that contains blocking keys and record ids\n", - " print('creating as_blocking_map database')\n", - " athenautils.drop_external_table(\"as_blocking_map\", \n", - " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n", - " database=config.DATABASE)\n", - "\n", - " q=\"\"\"\n", - " CREATE EXTERNAL TABLE as_blocking_map \n", - " (block_key VARCHAR(200), donor_id INTEGER)\n", - " ROW FORMAT DELIMITED\n", - " FIELDS TERMINATED BY '\\t'\n", - " LINES TERMINATED BY '\\n' \n", - " LOCATION\n", - " 's3://{}/{}' \n", - " TBLPROPERTIES (\n", - " 'classification'='csv', \n", - " --'skip.header.line.count'='1', \n", - " 'serialization.null.format'='')\n", - " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n", - " athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - " # If dedupe learned a Index Predicate, we have to take a pass\n", - " # through the data and create indices.\n", - " print('creating inverted index')\n", - "\n", - " # Armin: \n", - " # This never runs, index_fields is empty, possible bug?\n", - " for field in deduper.fingerprinter.index_fields:\n", - " q = \"\"\"\n", - " SELECT DISTINCT {field} FROM as_processed_donors\n", - " WHERE {field} IS NOT NULL\n", - " \"\"\".format(field=field)\n", - " cur = dict_cursor_execute(q, databse=config.DATABASE)\n", - " field_data = (row[field] for row in cur)\n", - " deduper.fingerprinter.index(field_data, 
field)\n", - " \n", - "\n", - " # Now we are ready to write our blocking map table by creating a\n", - " # generator that yields unique `(block_key, donor_id)` tuples.\n", - " print('writing blocking map')\n", - " \n", - " read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n", - " full_data = ((row['donor_id'], row) for row in read_cur)\n", - "\n", - " b_data = deduper.fingerprinter(full_data)\n", - " athenautils.write_many(b_data, \n", - " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n", - "\n", - "\n", - " # select unique pairs to compare\n", - " q=\"\"\"\n", - " SELECT a.donor_id,\n", - " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n", - " ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n", - " AS JSON)),\n", - " b.donor_id,\n", - " json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n", - " ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n", - " AS JSON))\n", - " FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n", - " from as_blocking_map as l\n", - " INNER JOIN as_blocking_map as r\n", - " using (block_key)\n", - " where l.donor_id < r.donor_id) ids\n", - " INNER JOIN as_processed_donors a on ids.east=a.donor_id\n", - " INNER JOIN as_processed_donors b on ids.west=b.donor_id\n", - " \"\"\"\n", - " read_cur = cursor_execute(q, database=config.DATABASE)\n", - "\n", - "\n", - " # ## Clustering\n", - "\n", - " print('clustering...')\n", - " clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n", - " threshold=0.5)\n", - "\n", - "# athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n", - " athenautils.drop_external_table(\"as_entity_map\", \n", - " location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n", - " database=config.DATABASE)\n", - " \n", - " print('creating as_entity_map database')\n", - " 
q=\"\"\"\n", - " CREATE EXTERNAL TABLE as_entity_map \n", - " (donor_id INTEGER, canon_id INTEGER, \n", - " cluster_score FLOAT)\n", - " ROW FORMAT DELIMITED\n", - " FIELDS TERMINATED BY '\\t'\n", - " LINES TERMINATED BY '\\n' \n", - " LOCATION\n", - " 's3://{}/{}' \n", - " TBLPROPERTIES (\n", - " 'classification'='csv', \n", - " --'skip.header.line.count'='1', \n", - " 'serialization.null.format'='')\n", - " \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n", - " athenautils.athena_start_query(q, database=config.DATABASE) \n", - "\n", - " athenautils.write_many(cluster_ids(clustered_dupes),\n", - " filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n", - "\n", - " # Print out the number of duplicates found\n", - " print('# duplicate sets')\n", - "\n", - " # ## Payoff\n", - "\n", - " # With all this done, we can now begin to ask interesting questions\n", - " # of the data\n", - " #\n", - " # For example, let's see who the top 10 donors are.\n", - "\n", - " locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers\n", - " \n", - " athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n", - " \n", - " q = \"\"\"\n", - " CREATE TABLE as_e_map as \n", - " SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n", - " FROM as_entity_map \n", - " RIGHT JOIN as_donors USING(donor_id) \n", - " \"\"\" \n", - " athenautils.athena_start_query(q, database=config.DATABASE)\n", - " \n", - " q = \"\"\"\n", - " SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, \n", - " donation_totals.totals AS totals \n", - " FROM as_donors INNER JOIN \n", - " (SELECT canon_id, SUM(cast (amount as double)) AS totals \n", - " FROM as_contributions INNER JOIN as_e_map \n", - " USING (donor_id) \n", - " GROUP BY (canon_id) \n", - " ORDER BY totals \n", - " DESC 
LIMIT 10) \n", - " AS donation_totals \n", - " ON as_donors.donor_id = donation_totals.canon_id\n", - " ORDER BY totals DESC\n", - " \"\"\"\n", - " cur = dict_cursor_execute(q, database=config.DATABASE)\n", - "\n", - " print(\"Top Donors (deduped)\")\n", - " for row in cur:\n", - " row['totals'] = locale.currency(row['totals'], grouping=True)\n", - " print('%(totals)20s: %(name)s' % row)\n", - "\n", - " # Compare this to what we would have gotten if we hadn't done any\n", - " # deduplication\n", - " q = \"\"\"\n", - " with donorscontributions as(\n", - "\n", - " SELECT as_donors.donor_id, \n", - " array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n", - " cast(as_contributions.amount as double) as amount\n", - " FROM as_donors INNER JOIN as_contributions \n", - " USING (donor_id) \n", - " )\n", - " SELECT name, sum(amount) AS totals \n", - " FROM donorscontributions\n", - " GROUP BY donor_id, name\n", - " ORDER BY totals DESC \n", - " LIMIT 10\n", - " \"\"\"\n", - " cur = dict_cursor_execute(q, database=config.DATABASE)\n", - "\n", - " print(\"Top Donors (raw)\")\n", - " for row in cur:\n", - " row['totals'] = locale.currency(row['totals'], grouping=True)\n", - " print('%(totals)20s: %(name)s' % row)\n", - "\n", - " print('ran in', time.time() - start_time, 'seconds')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python ../athena_example/athena_example.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" 
- }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb deleted file mode 100644 index d35250de..00000000 --- a/notebooks/athena_init_db.ipynb +++ /dev/null @@ -1,277 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install dedupe" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile ../athena_example/config.py\n", - "LOG_FILE = 'log.txt'\n", - "\n", - "# Connection parameters\n", - "ACCESS_KEY_ID = None\n", - "SECRET_ACCESS_KEY = None\n", - "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n", - "WORKGROUP = 'RDS'\n", - "REGION = 'eu-west-1'\n", - "DATABASE = 'ria_tmp'\n", - "\n", - "# Database Parameters\n", - "DATABASE_BUCKET = 'ria-temp'\n", - "DATABASE_ROOT_KEY = 'as_dedupe/'\n", - "BUFFERSIZE = 100000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile ../athena_example/athena_init.py\n", - "#!/usr/bin/python\n", - "\"\"\"\n", - "This is a setup script for athena_example. 
It downloads a zip file of\n", - "Illinois campaign contributions and loads them into a Athena database\n", - "named 'contributions'.\n", - " \n", - "__Note:__ You will need to run this script first before execuing\n", - "[athena_example.py](athena_example.py).\n", - " \n", - "Tables created:\n", - "* as_raw_table - raw import of entire CSV file\n", - "* donors - all distinct donors based on name and address\n", - "* recipients - all distinct campaign contribution recipients\n", - "* contributions - contribution amounts tied to donor and recipients tables\n", - "\"\"\"\n", - "\n", - "import os\n", - "import zipfile\n", - "import warnings\n", - "import pandas as pd\n", - "import numpy as np\n", - "from urllib.request import urlopen\n", - "import boto3\n", - "import config\n", - "import csv\n", - "import sys\n", - "sys.path.insert(0, '../athena_example/')\n", - "import athenautils\n", - "\n", - "\n", - "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n", - "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n", - "\n", - "if not os.path.exists(contributions_zip_file) :\n", - " print('downloading', contributions_zip_file, '(~60mb) ...')\n", - " u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n", - " localFile = open(contributions_zip_file, 'wb')\n", - " localFile.write(u.read())\n", - " localFile.close()\n", - "\n", - "if not os.path.exists(contributions_txt_file) :\n", - " zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n", - " print('extracting %s' % contributions_zip_file)\n", - " zip_file_contents = zip_file.namelist()\n", - " for f in zip_file_contents:\n", - " if ('.txt' in f):\n", - " zip_file.extract(f)\n", - " zip_file.close()\n", - "\n", - "\n", - "\n", - "\n", - "print('importing raw data from csv...')\n", - "athenautils.drop_external_table(\"as_raw_table\", \n", - " location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n", - " 
database=config.DATABASE) \n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n", - "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n", - "\n", - "\n", - "q=r\"\"\"\n", - "CREATE EXTERNAL TABLE as_raw_table \n", - " (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n", - " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n", - " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n", - " date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n", - " amount VARCHAR(23), receipt_type VARCHAR(23), \n", - " employer VARCHAR(70), occupation VARCHAR(40), \n", - " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n", - " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n", - " vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n", - " vendor_zip VARCHAR(10), description VARCHAR(90), \n", - " election_type VARCHAR(10), election_year VARCHAR(10), \n", - " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n", - " committee_name VARCHAR(70), committee_id VARCHAR(37)) \n", - "ROW FORMAT DELIMITED\n", - " FIELDS TERMINATED BY '\\t'\n", - " ESCAPED BY '\\\\'\n", - " LINES TERMINATED BY '\\n' \n", - "LOCATION\n", - " 's3://{}/{}' \n", - "TBLPROPERTIES (\n", - " 'classification'='csv', \n", - " 'skip.header.line.count'='1', \n", - " 'serialization.null.format'='')\n", - "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "\n", - "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE, \n", - " error_bad_lines=False, warn_bad_lines=True, 
dtype=str, keep_default_na=False, na_values=[''],\n", - " chunksize=config.BUFFERSIZE)\n", - "chunkcount = 0\n", - "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n", - "for df in df_cursor: \n", - " # Remove the very few records that mess up the demo \n", - " # (demo purposes only! Don't do something like this in production)\n", - " df = df[df['RcvDate'].str.len()>=10]\n", - "\n", - " # set empty, non-zero, strings in date columns to null\n", - " df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n", - "\n", - " df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n", - "\n", - " #committee ID is requred. Remove the 2 rows that don't have it.\n", - " df = df[df['ID']!='']\n", - "\n", - " # There's a record with a date stuck in the committee_id column, which causes\n", - " # problems when inserting into the contributions table below. Get rid of it this \n", - " # way.\n", - " df = df[df['ID'].str.len() <=9]\n", - "\n", - " # dropping the last columns\n", - " df = df.drop(columns='Unnamed: 29')\n", - "\n", - " df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n", - " \n", - " buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n", - " \n", - " chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n", - " athenautils.write(body=buffer, filename=chunk_fname)\n", - " chunkcount += 1 \n", - " \n", - "print('creating donors table...')\n", - "q=\"\"\"\n", - "CREATE TABLE as_donors as\n", - " with tmp as\n", - " (SELECT DISTINCT \n", - " NULLIF(TRIM(last_name), '') as last_name, \n", - " NULLIF(TRIM(first_name), '') as first_name, \n", - " NULLIF(TRIM(address_1), '') as address_1, \n", - " NULLIF(TRIM(address_2), '') as address_2, \n", - " 
NULLIF(TRIM(city), '') city, \n", - " NULLIF(TRIM(state), '') as state, \n", - " NULLIF(TRIM(zip), '') as zip, \n", - " NULLIF(TRIM(employer), '') as employer, \n", - " NULLIF(TRIM(occupation), '') as occupation\n", - " FROM as_raw_table)\n", - " SELECT row_number() over () as donor_id, * from tmp\"\"\"\n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "\n", - "q=\"\"\"\n", - "CREATE TABLE as_recipients as\n", - " SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n", - "\"\"\"\n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "print('creating contributions table')\n", - "\n", - "q=\"\"\"\n", - "CREATE TABLE as_contributions as\n", - " SELECT reciept_id as contribution_id, \n", - " donors.donor_id as donor_id , \n", - " committee_id as recipient_id, \n", - " report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n", - " loan_amount, amount, \n", - " receipt_type, vendor_last_name , \n", - " vendor_first_name, vendor_address_1, vendor_address_2, \n", - " vendor_city, vendor_state, vendor_zip, description, \n", - " election_type, election_year, \n", - " date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n", - " date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n", - " FROM as_raw_table JOIN as_donors donors ON \n", - " coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND \n", - " coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n", - " coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n", - " coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n", - " coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n", - " coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n", - " coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n", - " 
coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n", - " coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n", - "\n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "q = \"\"\"\n", - "CREATE TABLE as_processed_donors AS \n", - " SELECT donor_id, \n", - " LOWER(city) AS city, \n", - " CASE WHEN (first_name IS NULL AND last_name IS NULL) \n", - " THEN NULL \n", - " ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n", - " END AS name, \n", - " LOWER(zip) AS zip, \n", - " LOWER(state) AS state, \n", - " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n", - " THEN NULL \n", - " ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n", - " END AS address, \n", - " LOWER(occupation) AS occupation, \n", - " LOWER(employer) AS employer, \n", - " first_name is null AS person \n", - " FROM as_donors\"\"\"\n", - "athenautils.athena_start_query(q, database=config.DATABASE)\n", - "\n", - "\n", - "\n", - "\n", - "print('done')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python ../athena_example/athena_init.py" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From e547f88b441bd4cd41471c1c5507594a3c5c2060 Mon Sep 17 00:00:00 2001 From: asajadi Date: Sat, 8 May 2021 17:03:12 -0400 Subject: [PATCH 16/19] removing utils.py --- .gitignore | 1 - athena_example/utils.py | 138 ---------------------------------------- 2 files changed, 139 deletions(-) delete mode 100644 
athena_example/utils.py diff --git a/.gitignore b/.gitignore index a29de92b..3fb24683 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,3 @@ ENV distpgsql_init_db.py pgsql_example/pgsql_init_db.py .idea -.ipynb_checkpoints* diff --git a/athena_example/utils.py b/athena_example/utils.py deleted file mode 100644 index 80922548..00000000 --- a/athena_example/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -from __future__ import print_function -import re -import boto3 -import botocore -import sys -import datetime -import os -import time -import pandas as pd -from six import string_types -import sys -pyver = sys.version_info[0] - -if pyver<3: - from StringIO import StringIO as SomethingIO - from urlparse import urlparse -else: - from io import BytesIO as SomethingIO - from urllib.parse import urlparse - -sys.path.insert(0, '../athena_example/') -import config - -s3 = boto3.client('s3', region_name=config.REGION, - aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) - -athena = boto3.client('athena', region_name=config.REGION, - aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) - -def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs): - query_execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True) - df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs) - return df - - -def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True): - query_execution_id = athena.start_query_execution( - QueryString=query, - QueryExecutionContext={ - 'Database': database - }, - WorkGroup=workgroup, - ResultConfiguration={ - "OutputLocation": output_location - } - )['QueryExecutionId'] - - seconds_to_wait = 1 - - if 
wait_until_finished: - while True: - time.sleep(seconds_to_wait) - seconds_to_wait += 1 -# seconds_to_wait *= 2 - - execution = athena.get_query_execution( - QueryExecutionId=query_execution_id - ) - - if execution['QueryExecution']['Status']['State'] not in ['QUEUED', 'RUNNING']: - break - - if execution['QueryExecution']['Status']['State'] != 'SUCCEEDED': - raise Exception("Athena query failed: %s" % ( execution['QueryExecution']['Status']['StateChangeReason'],), query_execution_id) - - return query_execution_id - -# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py -# Import it instead, when it's updated. -def is_s3_url(url): - """Check for an s3, s3n, or s3a url""" - try: - return urlparse(url).scheme in ["s3", "s3n", "s3a"] - except Exception: - return False - -def seperate_bucket_key(url): - m = re.match('s3://([^/]+)/(.*)', url) - return m.group(1), m.group(2) - -def list_all(path): - if is_s3_url(path): - bucket, key = seperate_bucket_key(path) - objects = s3.list_objects_v2(Bucket=bucket, Prefix=key) - return [key['Key'] for key in objects['Contents']] - from os import listdir - from os.path import isfile, join - return listdir(path) - - -def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs): - return pd.read_csv(filepath_or_buffer, **kwargs) - -def reader(filename, Range='string', verbose=True): - ''' - Range: look at: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 - ''' - log ("Reading {}".format(filename), verbose=verbose) - if is_s3_url(filename): - bucket, key = seperate_bucket_key(filename) - obj=s3.get_object(Bucket=bucket, Key=key, Range=Range) - return obj['Body'] - -def write(body, filename): - bucket, key = seperate_bucket_key(filename) - s3.put_object(Bucket=bucket, Key=key, Body=body) - return - - -def file_exists(filename): - bucket, key = seperate_bucket_key(filename) - try: - s3.get_object(Bucket=bucket, Key=key) - except botocore.exceptions.ClientError as e: - if 
e.response['Error']['Code']=='NoSuchKey': - return False - else: - # Something else has gone wrong. - raise - else: - return True - - -def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False): - if verbose == False: - return - if timestamped: - outstr = "[%s]\t%s\n" % (str(datetime.datetime.now()) , outstr) - else: - outstr = "%s\n" % (outstr,) - - with open(logfile_name, "a") as logfile: - logfile.write(outstr) - - if not quiet: - sys.stdout.write(outstr); - sys.stdout.flush() -# Print iterations progress From 2667c9ea2c7effd8ace8053bccd4e3c1c5671e1f Mon Sep 17 00:00:00 2001 From: asajadi Date: Sat, 8 May 2021 17:25:07 -0400 Subject: [PATCH 17/19] renaming tables --- athena_example/athena_example.py | 64 ++++++++++++++++---------------- athena_example/athena_init.py | 54 +++++++++++++-------------- 2 files changed, 59 insertions(+), 59 deletions(-) diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py index c1c7fb3c..04837157 100644 --- a/athena_example/athena_example.py +++ b/athena_example/athena_example.py @@ -113,7 +113,7 @@ def cluster_ids(clustered_dupes): # We did a fair amount of preprocessing of the fields in # `athena_init_db.py` DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address - from as_processed_donors""" + from processed_donors""" # ## Training @@ -191,13 +191,13 @@ def cluster_ids(clustered_dupes): # To run blocking on such a large set of data, we create a separate table # that contains blocking keys and record ids - print('creating as_blocking_map database') - athenautils.drop_external_table("as_blocking_map", - location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'), + print('creating blocking_map database') + athenautils.drop_external_table("blocking_map", + location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map'), database=config.DATABASE) q=""" - CREATE EXTERNAL TABLE as_blocking_map + 
CREATE EXTERNAL TABLE blocking_map (block_key VARCHAR(200), donor_id INTEGER) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' @@ -208,7 +208,7 @@ def cluster_ids(clustered_dupes): 'classification'='csv', --'skip.header.line.count'='1', 'serialization.null.format'='') - """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') + """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') athenautils.athena_start_query(q, database=config.DATABASE) # If dedupe learned a Index Predicate, we have to take a pass @@ -219,7 +219,7 @@ def cluster_ids(clustered_dupes): # This never runs, index_fields is empty, possible bug? for field in deduper.fingerprinter.index_fields: q = """ - SELECT DISTINCT {field} FROM as_processed_donors + SELECT DISTINCT {field} FROM processed_donors WHERE {field} IS NOT NULL """.format(field=field) cur = dict_cursor_execute(q, databse=config.DATABASE) @@ -236,7 +236,7 @@ def cluster_ids(clustered_dupes): b_data = deduper.fingerprinter(full_data) athenautils.write_many(b_data, - filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv')) + filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv')) # select unique pairs to compare @@ -250,12 +250,12 @@ def cluster_ids(clustered_dupes): ARRAY[ b.city, b.name, b.zip, b.state, b.address]) AS JSON)) FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west - from as_blocking_map as l - INNER JOIN as_blocking_map as r + from blocking_map as l + INNER JOIN blocking_map as r using (block_key) where l.donor_id < r.donor_id) ids - INNER JOIN as_processed_donors a on ids.east=a.donor_id - INNER JOIN as_processed_donors b on ids.west=b.donor_id + INNER JOIN processed_donors a on ids.east=a.donor_id + INNER JOIN processed_donors b on ids.west=b.donor_id """ read_cur = cursor_execute(q, database=config.DATABASE) @@ -266,14 +266,14 @@ def cluster_ids(clustered_dupes): 
clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)), threshold=0.5) -# athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE) - athenautils.drop_external_table("as_entity_map", - location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), +# athenautils.athena_start_query("DROP TABLE IF EXISTS entity_map", database=config.DATABASE) + athenautils.drop_external_table("entity_map", + location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/'), database=config.DATABASE) - print('creating as_entity_map database') + print('creating entity_map database') q=""" - CREATE EXTERNAL TABLE as_entity_map + CREATE EXTERNAL TABLE entity_map (donor_id INTEGER, canon_id INTEGER, cluster_score FLOAT) ROW FORMAT DELIMITED @@ -285,11 +285,11 @@ def cluster_ids(clustered_dupes): 'classification'='csv', --'skip.header.line.count'='1', 'serialization.null.format'='') - """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') + """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') athenautils.athena_start_query(q, database=config.DATABASE) athenautils.write_many(cluster_ids(clustered_dupes), - filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv')) + filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv')) # Print out the number of duplicates found print('# duplicate sets') @@ -303,28 +303,28 @@ def cluster_ids(clustered_dupes): locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers - athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE) + athenautils.athena_start_query("DROP TABLE IF EXISTS e_map", database=config.DATABASE) q = """ - CREATE TABLE as_e_map as - SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id - FROM 
as_entity_map - RIGHT JOIN as_donors USING(donor_id) + CREATE TABLE e_map as + SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id + FROM entity_map + RIGHT JOIN donors USING(donor_id) """ athenautils.athena_start_query(q, database=config.DATABASE) q = """ - SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, + SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, donation_totals.totals AS totals - FROM as_donors INNER JOIN + FROM donors INNER JOIN (SELECT canon_id, SUM(cast (amount as double)) AS totals - FROM as_contributions INNER JOIN as_e_map + FROM contributions INNER JOIN e_map USING (donor_id) GROUP BY (canon_id) ORDER BY totals DESC LIMIT 10) AS donation_totals - ON as_donors.donor_id = donation_totals.canon_id + ON donors.donor_id = donation_totals.canon_id ORDER BY totals DESC """ cur = dict_cursor_execute(q, database=config.DATABASE) @@ -339,10 +339,10 @@ def cluster_ids(clustered_dupes): q = """ with donorscontributions as( - SELECT as_donors.donor_id, - array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name, - cast(as_contributions.amount as double) as amount - FROM as_donors INNER JOIN as_contributions + SELECT donors.donor_id, + array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, + cast(contributions.amount as double) as amount + FROM donors INNER JOIN contributions USING (donor_id) ) SELECT name, sum(amount) AS totals diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py index 45a5e254..099c5489 100644 --- a/athena_example/athena_init.py +++ b/athena_example/athena_init.py @@ -8,7 +8,7 @@ [athena_example.py](athena_example.py). 
Tables created: -* as_raw_table - raw import of entire CSV file +* raw_table - raw import of entire CSV file * donors - all distinct donors based on name and address * recipients - all distinct campaign contribution recipients * contributions - contribution amounts tied to donor and recipients tables @@ -51,17 +51,17 @@ print('importing raw data from csv...') -athenautils.drop_external_table("as_raw_table", - location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'), +athenautils.drop_external_table("raw_table", + location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table'), database=config.DATABASE) -athenautils.athena_start_query("DROP TABLE IF EXISTS as_donors", database=config.DATABASE) -athenautils.athena_start_query("DROP TABLE IF EXISTS as_recipients", database=config.DATABASE) -athenautils.athena_start_query("DROP TABLE IF EXISTS as_contributions", database=config.DATABASE) -athenautils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS donors", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS recipients", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS contributions", database=config.DATABASE) +athenautils.athena_start_query("DROP TABLE IF EXISTS processed_donors", database=config.DATABASE) q=r""" -CREATE EXTERNAL TABLE as_raw_table +CREATE EXTERNAL TABLE raw_table (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), @@ -85,7 +85,7 @@ 'classification'='csv', 'skip.header.line.count'='1', 'serialization.null.format'='') -""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') +""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') athenautils.athena_start_query(q, 
database=config.DATABASE) @@ -93,7 +93,7 @@ error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''], chunksize=config.BUFFERSIZE) chunkcount = 0 -filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv') +filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv') for df in df_cursor: # Remove the very few records that mess up the demo # (demo purposes only! Don't do something like this in production) @@ -125,7 +125,7 @@ print('creating donors table...') q=""" -CREATE TABLE as_donors as +CREATE TABLE donors as with tmp as (SELECT DISTINCT NULLIF(TRIM(last_name), '') as last_name, @@ -137,21 +137,21 @@ NULLIF(TRIM(zip), '') as zip, NULLIF(TRIM(employer), '') as employer, NULLIF(TRIM(occupation), '') as occupation - FROM as_raw_table) + FROM raw_table) SELECT row_number() over () as donor_id, * from tmp""" athenautils.athena_start_query(q, database=config.DATABASE) q=""" -CREATE TABLE as_recipients as - SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table +CREATE TABLE recipients as + SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table """ athenautils.athena_start_query(q, database=config.DATABASE) print('creating contributions table') q=""" -CREATE TABLE as_contributions as +CREATE TABLE contributions as SELECT reciept_id as contribution_id, donors.donor_id as donor_id , committee_id as recipient_id, @@ -163,21 +163,21 @@ election_type, election_year, date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, date_parse(report_period_end, '%m/%d/%Y') as report_period_end - FROM as_raw_table JOIN as_donors donors ON - coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND - coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND - 
coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND - coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND - coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND - coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND - coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND - coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND - coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')""" + FROM raw_table JOIN donors donors ON + coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND + coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND + coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND + coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND + coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND + coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND + coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND + coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND + coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')""" athenautils.athena_start_query(q, database=config.DATABASE) q = """ -CREATE TABLE as_processed_donors AS +CREATE TABLE processed_donors AS SELECT donor_id, LOWER(city) AS city, CASE WHEN (first_name IS NULL AND last_name IS NULL) @@ -193,7 +193,7 @@ LOWER(occupation) AS occupation, LOWER(employer) AS employer, first_name is null AS person - FROM as_donors""" + FROM donors""" athenautils.athena_start_query(q, database=config.DATABASE) From 8a28655e4f1b863e691cd583d87f0969d5184f9f Mon Sep 17 00:00:00 2001 From: asajadi Date: Sat, 8 May 2021 17:33:42 -0400 Subject: [PATCH 18/19] modifying requirements.txt --- athena_example/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/athena_example/requirements.txt b/athena_example/requirements.txt index 18c098ae..5dcfecc9 100644 --- a/athena_example/requirements.txt +++ b/athena_example/requirements.txt @@ -1 +1,3 @@ -mysqlclient +pandas +boto3 +dedupe From 736ff0b8520ebe117990bdaa23fbbd7916e28c13 Mon Sep 17 00:00:00 2001 From: asajadi Date: Sat, 8 May 2021 22:57:02 -0400 Subject: [PATCH 19/19] linting --- athena_example/athena_example.py | 290 +++++++++++++++++-------------- athena_example/athena_init.py | 283 +++++++++++++++++------------- athena_example/athenautils.py | 229 +++++++++++++++--------- 3 files changed, 468 insertions(+), 334 deletions(-) diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py index 04837157..d275af51 100644 --- a/athena_example/athena_example.py +++ b/athena_example/athena_example.py @@ -1,4 +1,3 @@ - """ This is an example of working with very large data. There are about 700,000 unduplicated donors in this database of Illinois political @@ -16,46 +15,59 @@ [csv_example](csv_example.html) """ -# There is a little bit difference between the result +# There is a little bit difference between the result # of this module and the athena one. The reason is due to -# Some special (and mostly erroneous) characters, such as \a .. +# Some special (and mostly erroneous) characters, such as \a .. 
# Which are dealt with differently by athena and athena/panda +import athenautils +import config import sys import os -import itertools import time import logging import optparse import locale import json -from io import StringIO -import csv -import pandas as pd -import boto3 import dedupe import dedupe.backport -sys.path.insert(0, '../athena_example/') -import config -sys.path.insert(0, '../athena_example/') -import athenautils + +sys.path.insert(0, "../athena_example/") + +sys.path.insert(0, "../athena_example/") + def cursor_execute(query, database): - ''' + """ The MySQL compatible Cursor - ''' - return athenautils.cursor_execute(query, database=database, - cursortype='tuple', buffersize=config.BUFFERSIZE, - escapechar=None, keep_default_na=False, na_values=['']) + """ + return athenautils.cursor_execute( + query, + database=database, + cursortype="tuple", + buffersize=config.BUFFERSIZE, + escapechar=None, + keep_default_na=False, + na_values=[""], + ) + def dict_cursor_execute(query, database): - ''' + """ The MySQL compatible DicCursor - ''' - return athenautils.cursor_execute(query, database=database, - cursortype='dict', buffersize=config.BUFFERSIZE, - escapechar=None, keep_default_na=False, na_values=['']) + """ + return athenautils.cursor_execute( + query, + database=database, + cursortype="dict", + buffersize=config.BUFFERSIZE, + escapechar=None, + keep_default_na=False, + na_values=[""], + ) + + def record_pairs(result_set): for i, row in enumerate(result_set): a_record_id, a_record, b_record_id, b_record = row @@ -76,18 +88,22 @@ def cluster_ids(clustered_dupes): yield donor_id, cluster_id, score -if __name__ == '__main__': +if __name__ == "__main__": - ## Logging + # Logging # Dedupe uses Python logging to show or suppress verbose output. Added # for convenience. 
To enable verbose output, run `python # examples/athena_example/athena_example.py -v` - + optp = optparse.OptionParser() - optp.add_option('-v', '--verbose', dest='verbose', action='count', - help='Increase verbosity (specify multiple times for more)' - ) + optp.add_option( + "-v", + "--verbose", + dest="verbose", + action="count", + help="Increase verbosity (specify multiple times for more)", + ) (opts, args) = optp.parse_args() log_level = logging.WARNING if opts.verbose: @@ -96,14 +112,10 @@ def cluster_ids(clustered_dupes): elif opts.verbose >= 2: log_level = logging.DEBUG - logging.getLogger().setLevel(log_level) - - - - settings_file = 'athena_example_settings' - training_file = 'athena_example_training.json' + settings_file = "athena_example_settings" + training_file = "athena_example_training.json" start_time = time.time() @@ -111,15 +123,15 @@ def cluster_ids(clustered_dupes): # in campaign donor info. # # We did a fair amount of preprocessing of the fields in - # `athena_init_db.py` + # `athena_init_db.py` DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address from processed_donors""" # ## Training if os.path.exists(settings_file): - print('reading from ', settings_file) - with open(settings_file, 'rb') as sf: + print("reading from ", settings_file) + with open(settings_file, "rb") as sf: deduper = dedupe.StaticDedupe(sf, num_cores=4) else: # Define the fields dedupe will pay attention to @@ -127,13 +139,13 @@ def cluster_ids(clustered_dupes): # The address, city, and zip fields are often missing, so we'll # tell dedupe that, and we'll learn a model that take that into # account - fields = [{'field': 'name', 'type': 'String'}, - {'field': 'address', 'type': 'String', - 'has missing': True}, - {'field': 'city', 'type': 'ShortString', 'has missing': True}, - {'field': 'state', 'type': 'ShortString', 'has missing': True}, - {'field': 'zip', 'type': 'ShortString', 'has missing': True}, - ] + fields = [ + {"field": "name", "type": "String"}, + 
{"field": "address", "type": "String", "has missing": True}, + {"field": "city", "type": "ShortString", "has missing": True}, + {"field": "state", "type": "ShortString", "has missing": True}, + {"field": "zip", "type": "ShortString", "has missing": True}, + ] # Create a new deduper object and pass our data model to it. deduper = dedupe.Dedupe(fields, num_cores=4) @@ -141,7 +153,6 @@ def cluster_ids(clustered_dupes): # We will sample pairs from the entire donor table for training cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE) temp_d = {i: row for i, row in enumerate(cur)} - # If we have training data saved from a previous run of dedupe, # look for it an load it in. @@ -149,7 +160,7 @@ def cluster_ids(clustered_dupes): # __Note:__ if you want to train from # scratch, delete the training_file if os.path.exists(training_file): - print('reading labeled examples from ', training_file) + print("reading labeled examples from ", training_file) with open(training_file) as tf: deduper.prepare_training(temp_d, training_file=tf) else: @@ -159,7 +170,7 @@ def cluster_ids(clustered_dupes): # ## Active learning - print('starting active labeling...') + print("starting active labeling...") # Starts the training loop. Dedupe will find the next pair of records # it is least certain about and ask you to label them as duplicates # or not. @@ -168,7 +179,7 @@ def cluster_ids(clustered_dupes): # press 'f' when you are finished dedupe.convenience.console_label(deduper) # When finished, save our labeled, training pairs to disk - with open(training_file, 'w') as tf: + with open(training_file, "w") as tf: deduper.write_training(tf) # Notice our the argument here @@ -178,7 +189,7 @@ def cluster_ids(clustered_dupes): # too many blocks and too many comparisons. 
deduper.train(recall=0.90) - with open(settings_file, 'wb') as sf: + with open(settings_file, "wb") as sf: deduper.write_settings(sf) # We can now remove some of the memory hobbing objects we used @@ -187,66 +198,77 @@ def cluster_ids(clustered_dupes): # ## Blocking - print('blocking...') + print("blocking...") # To run blocking on such a large set of data, we create a separate table # that contains blocking keys and record ids - print('creating blocking_map database') - athenautils.drop_external_table("blocking_map", - location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map'), - database=config.DATABASE) + print("creating blocking_map database") + athenautils.drop_external_table( + "blocking_map", + location="s3://{}/{}".format( + config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "blocking_map" + ), + database=config.DATABASE, + ) - q=""" - CREATE EXTERNAL TABLE blocking_map + q = """ + CREATE EXTERNAL TABLE blocking_map (block_key VARCHAR(200), donor_id INTEGER) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' - LINES TERMINATED BY '\n' + LINES TERMINATED BY '\n' LOCATION - 's3://{}/{}' + 's3://{}/{}' TBLPROPERTIES ( - 'classification'='csv', - --'skip.header.line.count'='1', + 'classification'='csv', + --'skip.header.line.count'='1', 'serialization.null.format'='') - """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') + """.format( + config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "blocking_map" + ) athenautils.athena_start_query(q, database=config.DATABASE) # If dedupe learned a Index Predicate, we have to take a pass # through the data and create indices. - print('creating inverted index') + print("creating inverted index") - # Armin: + # Armin: # This never runs, index_fields is empty, possible bug? 
for field in deduper.fingerprinter.index_fields: q = """ SELECT DISTINCT {field} FROM processed_donors WHERE {field} IS NOT NULL - """.format(field=field) + """.format( + field=field + ) cur = dict_cursor_execute(q, databse=config.DATABASE) field_data = (row[field] for row in cur) deduper.fingerprinter.index(field_data, field) - # Now we are ready to write our blocking map table by creating a # generator that yields unique `(block_key, donor_id)` tuples. - print('writing blocking map') - - read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE) - full_data = ((row['donor_id'], row) for row in read_cur) + print("writing blocking map") - b_data = deduper.fingerprinter(full_data) - athenautils.write_many(b_data, - filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv')) + read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE) + full_data = ((row["donor_id"], row) for row in read_cur) + b_data = deduper.fingerprinter(full_data) + athenautils.write_many( + b_data, + filename="s3://{}/{}".format( + config.DATABASE_BUCKET, + config.DATABASE_ROOT_KEY + "blocking_map/blocking.csv", + ), + ) # select unique pairs to compare - q=""" + q = """ SELECT a.donor_id, json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], ARRAY[ a.city, a.name, a.zip, a.state, a.address]) AS JSON)), b.donor_id, - json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], + json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], ARRAY[ b.city, b.name, b.zip, b.state, b.address]) AS JSON)) FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west @@ -259,40 +281,51 @@ def cluster_ids(clustered_dupes): """ read_cur = cursor_execute(q, database=config.DATABASE) - # ## Clustering - print('clustering...') - clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)), - threshold=0.5) - -# athenautils.athena_start_query("DROP TABLE IF EXISTS entity_map", 
database=config.DATABASE) - athenautils.drop_external_table("entity_map", - location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/'), - database=config.DATABASE) - - print('creating entity_map database') - q=""" - CREATE EXTERNAL TABLE entity_map - (donor_id INTEGER, canon_id INTEGER, + print("clustering...") + clustered_dupes = deduper.cluster( + deduper.score(record_pairs(read_cur)), threshold=0.5 + ) + + # athenautils.athena_start_query("DROP TABLE IF EXISTS entity_map", database=config.DATABASE) + athenautils.drop_external_table( + "entity_map", + location="s3://{}/{}".format( + config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "entity_map/" + ), + database=config.DATABASE, + ) + + print("creating entity_map database") + q = """ + CREATE EXTERNAL TABLE entity_map + (donor_id INTEGER, canon_id INTEGER, cluster_score FLOAT) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' - LINES TERMINATED BY '\n' + LINES TERMINATED BY '\n' LOCATION - 's3://{}/{}' + 's3://{}/{}' TBLPROPERTIES ( - 'classification'='csv', - --'skip.header.line.count'='1', + 'classification'='csv', + --'skip.header.line.count'='1', 'serialization.null.format'='') - """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') - athenautils.athena_start_query(q, database=config.DATABASE) + """.format( + config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "entity_map" + ) + athenautils.athena_start_query(q, database=config.DATABASE) - athenautils.write_many(cluster_ids(clustered_dupes), - filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv')) + athenautils.write_many( + cluster_ids(clustered_dupes), + filename="s3://{}/{}".format( + config.DATABASE_BUCKET, + config.DATABASE_ROOT_KEY + "entity_map/entity_map.csv", + ), + ) # Print out the number of duplicates found - print('# duplicate sets') + print("# duplicate sets") # ## Payoff @@ -301,29 +334,32 @@ def cluster_ids(clustered_dupes): # # For example, 
let's see who the top 10 donors are. - locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8') # for pretty printing numbers - - athenautils.athena_start_query("DROP TABLE IF EXISTS e_map", database=config.DATABASE) - + # for pretty printing numbers + locale.setlocale(locale.LC_ALL, "en_CA.UTF-8") + + athenautils.athena_start_query( + "DROP TABLE IF EXISTS e_map", database=config.DATABASE + ) + q = """ - CREATE TABLE e_map as - SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id - FROM entity_map - RIGHT JOIN donors USING(donor_id) - """ + CREATE TABLE e_map as + SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id + FROM entity_map + RIGHT JOIN donors USING(donor_id) + """ athenautils.athena_start_query(q, database=config.DATABASE) - + q = """ - SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, - donation_totals.totals AS totals - FROM donors INNER JOIN - (SELECT canon_id, SUM(cast (amount as double)) AS totals - FROM contributions INNER JOIN e_map - USING (donor_id) - GROUP BY (canon_id) - ORDER BY totals - DESC LIMIT 10) - AS donation_totals + SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, + donation_totals.totals AS totals + FROM donors INNER JOIN + (SELECT canon_id, SUM(cast (amount as double)) AS totals + FROM contributions INNER JOIN e_map + USING (donor_id) + GROUP BY (canon_id) + ORDER BY totals + DESC LIMIT 10) + AS donation_totals ON donors.donor_id = donation_totals.canon_id ORDER BY totals DESC """ @@ -331,31 +367,31 @@ def cluster_ids(clustered_dupes): print("Top Donors (deduped)") for row in cur: - row['totals'] = locale.currency(row['totals'], grouping=True) - print('%(totals)20s: %(name)s' % row) + row["totals"] = locale.currency(row["totals"], grouping=True) + print("%(totals)20s: %(name)s" % row) # Compare this to what we would have gotten if we hadn't done any # deduplication q = """ with 
donorscontributions as( - SELECT donors.donor_id, + SELECT donors.donor_id, array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name, cast(contributions.amount as double) as amount - FROM donors INNER JOIN contributions - USING (donor_id) + FROM donors INNER JOIN contributions + USING (donor_id) ) - SELECT name, sum(amount) AS totals + SELECT name, sum(amount) AS totals FROM donorscontributions GROUP BY donor_id, name - ORDER BY totals DESC + ORDER BY totals DESC LIMIT 10 """ cur = dict_cursor_execute(q, database=config.DATABASE) print("Top Donors (raw)") for row in cur: - row['totals'] = locale.currency(row['totals'], grouping=True) - print('%(totals)20s: %(name)s' % row) + row["totals"] = locale.currency(row["totals"], grouping=True) + print("%(totals)20s: %(name)s" % row) - print('ran in', time.time() - start_time, 'seconds') + print("ran in", time.time() - start_time, "seconds") diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py index 099c5489..42108846 100644 --- a/athena_example/athena_init.py +++ b/athena_example/athena_init.py @@ -3,10 +3,10 @@ This is a setup script for athena_example. It downloads a zip file of Illinois campaign contributions and loads them into a Athena database named 'contributions'. - + __Note:__ You will need to run this script first before execuing [athena_example.py](athena_example.py). 
- + Tables created: * raw_table - raw import of entire CSV file * donors - all distinct donors based on name and address @@ -14,189 +14,226 @@ * contributions - contribution amounts tied to donor and recipients tables """ +import athenautils import os import zipfile -import warnings import pandas as pd import numpy as np from urllib.request import urlopen -import boto3 import config import csv import sys -sys.path.insert(0, '../athena_example/') -import athenautils +sys.path.insert(0, "../athena_example/") -contributions_zip_file = 'Illinois-campaign-contributions.txt.zip' -contributions_txt_file = 'Illinois-campaign-contributions.txt' -if not os.path.exists(contributions_zip_file) : - print('downloading', contributions_zip_file, '(~60mb) ...') - u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip') - localFile = open(contributions_zip_file, 'wb') +contributions_zip_file = "Illinois-campaign-contributions.txt.zip" +contributions_txt_file = "Illinois-campaign-contributions.txt" + +if not os.path.exists(contributions_zip_file): + print("downloading", contributions_zip_file, "(~60mb) ...") + u = urlopen( + "https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip" + ) + localFile = open(contributions_zip_file, "wb") localFile.write(u.read()) localFile.close() -if not os.path.exists(contributions_txt_file) : - zip_file = zipfile.ZipFile(contributions_zip_file, 'r') - print('extracting %s' % contributions_zip_file) +if not os.path.exists(contributions_txt_file): + zip_file = zipfile.ZipFile(contributions_zip_file, "r") + print("extracting %s" % contributions_zip_file) zip_file_contents = zip_file.namelist() for f in zip_file_contents: - if ('.txt' in f): + if ".txt" in f: zip_file.extract(f) zip_file.close() - - -print('importing raw data from csv...') -athenautils.drop_external_table("raw_table", - location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table'), - 
database=config.DATABASE) +print("importing raw data from csv...") +athenautils.drop_external_table( + "raw_table", + location="s3://{}/{}".format( + config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "raw_table" + ), + database=config.DATABASE, +) athenautils.athena_start_query("DROP TABLE IF EXISTS donors", database=config.DATABASE) -athenautils.athena_start_query("DROP TABLE IF EXISTS recipients", database=config.DATABASE) -athenautils.athena_start_query("DROP TABLE IF EXISTS contributions", database=config.DATABASE) -athenautils.athena_start_query("DROP TABLE IF EXISTS processed_donors", database=config.DATABASE) - - -q=r""" -CREATE EXTERNAL TABLE raw_table - (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), - address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), - state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), - date_recieved VARCHAR(10), loan_amount VARCHAR(12), - amount VARCHAR(23), receipt_type VARCHAR(23), - employer VARCHAR(70), occupation VARCHAR(40), - vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), - vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), - vendor_city VARCHAR(20), vendor_state VARCHAR(10), - vendor_zip VARCHAR(10), description VARCHAR(90), - election_type VARCHAR(10), election_year VARCHAR(10), - report_period_begin VARCHAR(10), report_period_end VARCHAR(33), - committee_name VARCHAR(70), committee_id VARCHAR(37)) +athenautils.athena_start_query( + "DROP TABLE IF EXISTS recipients", database=config.DATABASE +) +athenautils.athena_start_query( + "DROP TABLE IF EXISTS contributions", database=config.DATABASE +) +athenautils.athena_start_query( + "DROP TABLE IF EXISTS processed_donors", database=config.DATABASE +) + + +q = r""" +CREATE EXTERNAL TABLE raw_table + (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), + address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), + state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), + date_recieved VARCHAR(10), 
loan_amount VARCHAR(12), + amount VARCHAR(23), receipt_type VARCHAR(23), + employer VARCHAR(70), occupation VARCHAR(40), + vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), + vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), + vendor_city VARCHAR(20), vendor_state VARCHAR(10), + vendor_zip VARCHAR(10), description VARCHAR(90), + election_type VARCHAR(10), election_year VARCHAR(10), + report_period_begin VARCHAR(10), report_period_end VARCHAR(33), + committee_name VARCHAR(70), committee_id VARCHAR(37)) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ESCAPED BY '\\' - LINES TERMINATED BY '\n' + LINES TERMINATED BY '\n' LOCATION - 's3://{}/{}' + 's3://{}/{}' TBLPROPERTIES ( - 'classification'='csv', - 'skip.header.line.count'='1', + 'classification'='csv', + 'skip.header.line.count'='1', 'serialization.null.format'='') -""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') +""".format( + config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "raw_table" +) athenautils.athena_start_query(q, database=config.DATABASE) -df_cursor = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE, - error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''], - chunksize=config.BUFFERSIZE) +df_cursor = pd.read_csv( + contributions_txt_file, + sep="\t", + escapechar="\\", + quoting=csv.QUOTE_NONE, + error_bad_lines=False, + warn_bad_lines=True, + dtype=str, + keep_default_na=False, + na_values=[""], + chunksize=config.BUFFERSIZE, +) chunkcount = 0 -filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv') -for df in df_cursor: - # Remove the very few records that mess up the demo +filename = os.path.join( + "s3://", + config.DATABASE_BUCKET, + config.DATABASE_ROOT_KEY, + "raw_table", + os.path.splitext(contributions_txt_file)[0] + ".csv", +) +for df in df_cursor: + # Remove the very few records that 
mess up the demo # (demo purposes only! Don't do something like this in production) - df = df[df['RcvDate'].str.len()>=10] + df = df[df["RcvDate"].str.len() >= 10] # set empty, non-zero, strings in date columns to null - df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan + df.loc[df["RptPdBegDate"].str.len() < 10, "RptPdBegDate"] = np.nan - df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan + df.loc[df["RptPdEndDate"].str.len() < 10, "RptPdEndDate"] = np.nan - #committee ID is requred. Remove the 2 rows that don't have it. - df = df[df['ID']!=''] + # committee ID is requred. Remove the 2 rows that don't have it. + df = df[df["ID"] != ""] - # There's a record with a date stuck in the committee_id column, which causes - # problems when inserting into the contributions table below. Get rid of it this - # way. - df = df[df['ID'].str.len() <=9] + # There's a record with a date stuck in the committee_id column, + # which causes problems when inserting into the contributions table below. + # Get rid of it this way. 
- # dropping the last columns - df = df.drop(columns='Unnamed: 29') + df = df[df["ID"].str.len() <= 9] - df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand') - - buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None) - - chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False) + # dropping the last columns + df = df.drop(columns="Unnamed: 29") + + df_lower = df.apply( + lambda x: x.str.lower() + .str.normalize("NFKD") + .str.encode("ascii", errors="ignore") + .str.decode("utf-8") + if x.dtype == "object" + else x, + result_type="expand", + ) + + buffer = df_lower.to_csv( + quoting=csv.QUOTE_NONE, sep="\t", escapechar="\\", index=None + ) + + chunk_fname = athenautils.file_name_append( + filename, "_{}".format(chunkcount), ommitext=False + ) athenautils.write(body=buffer, filename=chunk_fname) - chunkcount += 1 - -print('creating donors table...') -q=""" + chunkcount += 1 + +print("creating donors table...") +q = """ CREATE TABLE donors as with tmp as - (SELECT DISTINCT - NULLIF(TRIM(last_name), '') as last_name, - NULLIF(TRIM(first_name), '') as first_name, - NULLIF(TRIM(address_1), '') as address_1, - NULLIF(TRIM(address_2), '') as address_2, - NULLIF(TRIM(city), '') city, - NULLIF(TRIM(state), '') as state, - NULLIF(TRIM(zip), '') as zip, - NULLIF(TRIM(employer), '') as employer, + (SELECT DISTINCT + NULLIF(TRIM(last_name), '') as last_name, + NULLIF(TRIM(first_name), '') as first_name, + NULLIF(TRIM(address_1), '') as address_1, + NULLIF(TRIM(address_2), '') as address_2, + NULLIF(TRIM(city), '') city, + NULLIF(TRIM(state), '') as state, + NULLIF(TRIM(zip), '') as zip, + NULLIF(TRIM(employer), '') as employer, NULLIF(TRIM(occupation), '') as occupation FROM raw_table) SELECT row_number() over () as donor_id, * from tmp""" athenautils.athena_start_query(q, 
database=config.DATABASE) -q=""" +q = """ CREATE TABLE recipients as SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table """ athenautils.athena_start_query(q, database=config.DATABASE) -print('creating contributions table') +print("creating contributions table") -q=""" +q = """ CREATE TABLE contributions as - SELECT reciept_id as contribution_id, - donors.donor_id as donor_id , - committee_id as recipient_id, - report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, - loan_amount, amount, - receipt_type, vendor_last_name , - vendor_first_name, vendor_address_1, vendor_address_2, - vendor_city, vendor_state, vendor_zip, description, - election_type, election_year, - date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, - date_parse(report_period_end, '%m/%d/%Y') as report_period_end - FROM raw_table JOIN donors donors ON - coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND - coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND - coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND - coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND - coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND - coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND - coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND - coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND + SELECT reciept_id as contribution_id, + donors.donor_id as donor_id , + committee_id as recipient_id, + report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, + loan_amount, amount, + receipt_type, vendor_last_name , + vendor_first_name, vendor_address_1, vendor_address_2, + vendor_city, vendor_state, vendor_zip, description, + election_type, election_year, + date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, + date_parse(report_period_end, '%m/%d/%Y') as 
report_period_end + FROM raw_table JOIN donors donors ON + coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND + coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND + coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND + coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND + coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND + coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND + coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND + coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')""" athenautils.athena_start_query(q, database=config.DATABASE) q = """ -CREATE TABLE processed_donors AS - SELECT donor_id, - LOWER(city) AS city, - CASE WHEN (first_name IS NULL AND last_name IS NULL) - THEN NULL - ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) - END AS name, - LOWER(zip) AS zip, - LOWER(state) AS state, - CASE WHEN (address_1 IS NULL AND address_2 IS NULL) - THEN NULL +CREATE TABLE processed_donors AS + SELECT donor_id, + LOWER(city) AS city, + CASE WHEN (first_name IS NULL AND last_name IS NULL) + THEN NULL + ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) + END AS name, + LOWER(zip) AS zip, + LOWER(state) AS state, + CASE WHEN (address_1 IS NULL AND address_2 IS NULL) + THEN NULL ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' ')) - END AS address, - LOWER(occupation) AS occupation, - LOWER(employer) AS employer, - first_name is null AS person + END AS address, + LOWER(occupation) AS occupation, + LOWER(employer) AS employer, + first_name is null AS person FROM donors""" athenautils.athena_start_query(q, database=config.DATABASE) - - -print('done') +print("done") diff --git a/athena_example/athenautils.py 
b/athena_example/athenautils.py index a88463e2..9a68f367 100644 --- a/athena_example/athenautils.py +++ b/athena_example/athenautils.py @@ -1,71 +1,107 @@ from __future__ import print_function +import config import re import boto3 import botocore import sys -import datetime import os import time +from os import listdir +import shutil import pandas as pd -from six import string_types -import sys + + pyver = sys.version_info[0] -if pyver<3: +if pyver < 3: from StringIO import StringIO as SomethingIO from urlparse import urlparse else: from io import BytesIO as SomethingIO from urllib.parse import urlparse - -sys.path.insert(0, '../athena_example/') -import config -s3 = boto3.client('s3', region_name=config.REGION, - aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) - -athena = boto3.client('athena', region_name=config.REGION, - aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY) - -def cursor_execute(query, database=None, cursortype='tuple', buffersize=1000000, - output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, - **kwargs): - - kwargs['chunksize']=buffersize - df_cur = athena_to_panda(query, database=database, - output_location=output_location, region=region, workgroup=workgroup, - **kwargs) +sys.path.insert(0, "../athena_example/") + +s3 = boto3.client( + "s3", + region_name=config.REGION, + aws_access_key_id=config.ACCESS_KEY_ID, + aws_secret_access_key=config.SECRET_ACCESS_KEY, +) + +athena = boto3.client( + "athena", + region_name=config.REGION, + aws_access_key_id=config.ACCESS_KEY_ID, + aws_secret_access_key=config.SECRET_ACCESS_KEY, +) + + +def cursor_execute( + query, + database=None, + cursortype="tuple", + buffersize=1000000, + output_location=config.ATHENA_GARBAGE_PATH, + region=config.REGION, + workgroup=config.WORKGROUP, + **kwargs +): + + kwargs["chunksize"] = buffersize + df_cur = athena_to_panda( + query, + database=database, + 
output_location=output_location, + region=region, + workgroup=workgroup, + **kwargs + ) for df in df_cur: - if cursortype == 'dict': - all_rows = df.where(pd.notnull(df), None).to_dict('records') - if cursortype == 'tuple': + if cursortype == "dict": + all_rows = df.where(pd.notnull(df), None).to_dict("records") + if cursortype == "tuple": all_rows = df.where(pd.notnull(df), None).itertuples(index=False, name=None) for row in all_rows: yield row - - -def athena_to_panda(query, database=None, - output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, - **kwargs): - query_execution_id = athena_start_query(query, database=database, - output_location=output_location, region=region, workgroup=workgroup, - wait_until_finished=True) - df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs) + + +def athena_to_panda( + query, + database=None, + output_location=config.ATHENA_GARBAGE_PATH, + region=config.REGION, + workgroup=config.WORKGROUP, + **kwargs +): + query_execution_id = athena_start_query( + query, + database=database, + output_location=output_location, + region=region, + workgroup=workgroup, + wait_until_finished=True, + ) + df = pandas_read_csv( + os.path.join(output_location, query_execution_id + ".csv"), **kwargs + ) return df -def athena_start_query(query, database=None, - output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, - wait_until_finished=True): + +def athena_start_query( + query, + database=None, + output_location=config.ATHENA_GARBAGE_PATH, + region=config.REGION, + workgroup=config.WORKGROUP, + wait_until_finished=True, +): query_execution_id = athena.start_query_execution( QueryString=query, - QueryExecutionContext={ - 'Database': database - }, + QueryExecutionContext={"Database": database}, WorkGroup=workgroup, - ResultConfiguration={ - "OutputLocation": output_location - } - )['QueryExecutionId'] + ResultConfiguration={"OutputLocation": 
output_location}, + )["QueryExecutionId"] seconds_to_wait = 1 @@ -73,115 +109,140 @@ def athena_start_query(query, database=None, while True: time.sleep(seconds_to_wait) seconds_to_wait += 1 -# seconds_to_wait *= 2 + # seconds_to_wait *= 2 - execution = athena.get_query_execution( - QueryExecutionId=query_execution_id - ) + execution = athena.get_query_execution(QueryExecutionId=query_execution_id) - if execution['QueryExecution']['Status']['State'] not in ['QUEUED', 'RUNNING']: + if execution["QueryExecution"]["Status"]["State"] not in [ + "QUEUED", + "RUNNING", + ]: break - if execution['QueryExecution']['Status']['State'] != 'SUCCEEDED': - raise Exception("Athena query failed: %s" % ( execution['QueryExecution']['Status']['StateChangeReason'],), query_execution_id) + if execution["QueryExecution"]["Status"]["State"] != "SUCCEEDED": + raise Exception( + "Athena query failed: %s" + % (execution["QueryExecution"]["Status"]["StateChangeReason"],), + query_execution_id, + ) return query_execution_id -# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py + +# Copied from +# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py # Import it instead, when it's updated. 
+ + def is_s3_url(url): """Check for an s3, s3n, or s3a url""" try: return urlparse(url).scheme in ["s3", "s3n", "s3a"] except Exception: return False - + + def seperate_bucket_key(url): - m = re.match('s3://([^/]+)/(.*)', url) + m = re.match("s3://([^/]+)/(.*)", url) return m.group(1), m.group(2) + def list_all(path): if is_s3_url(path): bucket, key = seperate_bucket_key(path) objects = s3.list_objects_v2(Bucket=bucket, Prefix=key) - if not 'Contents' in objects: + if "Contents" not in objects: return [] - return [key['Key'] for key in objects['Contents']] - from os import listdir - from os.path import isfile, join + return [key["Key"] for key in objects["Contents"]] if not os.path.exists(path): return [] return listdir(path) + def del_all_files(path): filelist = list_all(path) if is_s3_url(path): bucket, key = seperate_bucket_key(path) for f in filelist: - s3.delete_object(Bucket=bucket, Key=f) + s3.delete_object(Bucket=bucket, Key=f) return filelist = [os.path.join(path, f) for f in filelist] for f in filelist: if os.path.isfile(f): os.remove(f) - else: - shutil.rmtree(f) - -def drop_external_table(tablename, location , database=None, - output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP): - athena_start_query('drop table if exists {}'.format(tablename), database=database, - output_location=output_location, region=region, workgroup=workgroup) + else: + shutil.rmtree(f) + + +def drop_external_table( + tablename, + location, + database=None, + output_location=config.ATHENA_GARBAGE_PATH, + region=config.REGION, + workgroup=config.WORKGROUP, +): + athena_start_query( + "drop table if exists {}".format(tablename), + database=database, + output_location=output_location, + region=region, + workgroup=workgroup, + ) del_all_files(location) - - def pandas_read_csv(filepath_or_buffer, **kwargs): bucket, key = seperate_bucket_key(filepath_or_buffer) obj = s3.get_object(Bucket=bucket, Key=key) - return 
pd.read_csv(SomethingIO(obj['Body'].read()), **kwargs) + return pd.read_csv(SomethingIO(obj["Body"].read()), **kwargs) + def read(filename): if is_s3_url(filename): bucket, key = seperate_bucket_key(filename) - obj=s3.get_object(Bucket=bucket, Key=key) - return obj['Body'].read() - with open (filename) as f: + obj = s3.get_object(Bucket=bucket, Key=key) + return obj["Body"].read() + with open(filename) as f: return f.read() + def write(body, filename): bucket, key = seperate_bucket_key(filename) s3.put_object(Bucket=bucket, Key=key, Body=body) return - + def file_name_append(filename, append, ommitext): - filename_base, ext = os.path.splitext(filename) - if ommitext: - return '%s%s' % (filename_base, append) - return '%s%s%s' % (filename_base, append, ext) + filename_base, ext = os.path.splitext(filename) + if ommitext: + return "%s%s" % (filename_base, append) + return "%s%s%s" % (filename_base, append, ext) + def write_many(read_cursor, filename, buffersize=config.BUFFERSIZE): - chunkcount=0 + chunkcount = 0 while True: buffer_df = pd.DataFrame.from_records(read_cursor, nrows=buffersize) - if buffer_df.empty: - break - buffer = buffer_df.to_csv(index=False, header=False, sep='\t') - chunk_fname = file_name_append(filename, '_{}'.format(chunkcount), ommitext=False) + if buffer_df.empty: + break + buffer = buffer_df.to_csv(index=False, header=False, sep="\t") + chunk_fname = file_name_append( + filename, "_{}".format(chunkcount), ommitext=False + ) write(buffer, chunk_fname) chunkcount += 1 - + + def file_exists(filename): bucket, key = seperate_bucket_key(filename) try: s3.get_object(Bucket=bucket, Key=key) except botocore.exceptions.ClientError as e: - if e.response['Error']['Code']=='NoSuchKey': + if e.response["Error"]["Code"] == "NoSuchKey": return False else: # Something else has gone wrong. raise else: return True -