From e04bb0edc47b3b6a6a42d1a0c9845afe26c6f6f2 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-21-202.eu-west-1.compute.internal>
Date: Thu, 18 Jun 2020 03:25:55 +0000
Subject: [PATCH 01/19] initial commit-adding presto example

---
 notebooks/mysql_init_db.ipynb   | 594 ++++++++++++++++++++++++++++++++
 presto_example/README.md        |  23 ++
 presto_example/mysql.cnf_LOCAL  |   4 +
 presto_example/mysql_example.py | 344 ++++++++++++++++++
 presto_example/mysql_init_db.py | 234 +++++++++++++
 presto_example/requirements.txt |   1 +
 6 files changed, 1200 insertions(+)
 create mode 100644 notebooks/mysql_init_db.ipynb
 create mode 100644 presto_example/README.md
 create mode 100644 presto_example/mysql.cnf_LOCAL
 create mode 100644 presto_example/mysql_example.py
 create mode 100644 presto_example/mysql_init_db.py
 create mode 100644 presto_example/requirements.txt

diff --git a/notebooks/mysql_init_db.ipynb b/notebooks/mysql_init_db.ipynb
new file mode 100644
index 00000000..1dd956d9
--- /dev/null
+++ b/notebooks/mysql_init_db.ipynb
@@ -0,0 +1,594 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n",
+    "bucket='com.ria.scratch'\n",
+    "region='eu-west-1'\n",
+    "workgroup = 'RIA'\n",
+    "root_key='as-dedupe/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %load ../mysql_example/mysql_init_db.py\n",
+    "#!/usr/bin/python\n",
+    "\"\"\"\n",
+    "This is a setup script for mysql_example.  It downloads a zip file of\n",
+    "Illinois campaign contributions and loads them into a MySQL database\n",
+    "named 'contributions'.\n",
+    " \n",
+    "__Note:__ You will need to run this script first before executing\n",
+    "[mysql_example.py](mysql_example.html).\n",
+    " \n",
+    "Tables created:\n",
+    "* raw_table - raw import of entire CSV file\n",
+    "* donors - all distinct donors based on name and address\n",
+    "* recipients - all distinct campaign contribution recipients\n",
+    "* contributions - contribution amounts tied to donor and recipients tables\n",
+    "\"\"\"\n",
+    "\n",
+    "import os\n",
+    "import zipfile\n",
+    "import warnings\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from urllib.request import urlopen\n",
+    "import boto3\n",
+    "from pyathena import connect\n",
+    "\n",
+    "# import MySQLdb\n",
+    "\n",
+    "# warnings.filterwarnings('ignore', category=MySQLdb.Warning)\n",
+    "\n",
+    "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
+    "contributions_txt_file = 'Illinois-campaign-contributions.txt'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(contributions_zip_file) :\n",
+    "    print('downloading', contributions_zip_file, '(~60mb) ...')\n",
+    "    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n",
+    "    localFile = open(contributions_zip_file, 'wb')\n",
+    "    localFile.write(u.read())\n",
+    "    localFile.close()\n",
+    "\n",
+    "if not os.path.exists(contributions_txt_file) :\n",
+    "    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n",
+    "    print('extracting %s' % contributions_zip_file)\n",
+    "    zip_file_contents = zip_file.namelist()\n",
+    "    for f in zip_file_contents:\n",
+    "        if ('.txt' in f):\n",
+    "            zip_file.extract(f)\n",
+    "    zip_file.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf', \n",
+    "#                        local_infile = 1,\n",
+    "#                        sql_mode=\"ALLOW_INVALID_DATES\",\n",
+    "#                        db='contributions')\n",
+    "# c = conn.cursor()\n",
+    "\n",
+    "s3 = boto3.client('s3')  \n",
+    "conn = connect(s3_staging_dir=athena_garbage,\n",
+    "                 region_name=region, work_group=workgroup)\n",
+    "c = conn.cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "importing raw data from csv...\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print('importing raw data from csv...')\n",
+    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n",
+    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.donors\")\n",
+    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.recipients\")\n",
+    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.contributions\")\n",
+    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# c.execute(\"CREATE TABLE raw_table \"\n",
+    "#           \"(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \"\n",
+    "#           \" address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \"\n",
+    "#           \" state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \"\n",
+    "#           \" date_recieved VARCHAR(10), loan_amount VARCHAR(12), \"\n",
+    "#           \" amount VARCHAR(23), receipt_type VARCHAR(23), \"\n",
+    "#           \" employer VARCHAR(70), occupation VARCHAR(40), \"\n",
+    "#           \" vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \"\n",
+    "#           \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n",
+    "#           \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n",
+    "#           \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n",
+    "#           \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n",
+    "#           \" report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \"\n",
+    "#           \" committee_name VARCHAR(70), committee_id VARCHAR(37)) \"\n",
+    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "\n",
+    "\n",
+    "# conn.commit()\n",
+    "q=r'''\n",
+    "CREATE EXTERNAL TABLE ria_data_science_s3.raw_table \n",
+    "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
+    "    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n",
+    "    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n",
+    "    date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n",
+    "    amount VARCHAR(23), receipt_type VARCHAR(23), \n",
+    "    employer VARCHAR(70), occupation VARCHAR(40), \n",
+    "    vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n",
+    "    vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n",
+    "    vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n",
+    "    vendor_zip VARCHAR(10), description VARCHAR(90), \n",
+    "    election_type VARCHAR(10), election_year VARCHAR(10), \n",
+    "    report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n",
+    "    committee_name VARCHAR(70), committee_id VARCHAR(37)) \n",
+    "ROW FORMAT DELIMITED\n",
+    "  FIELDS TERMINATED BY '\\t'\n",
+    "  LINES TERMINATED BY '\\n'  \n",
+    "LOCATION\n",
+    "    's3://{}/{}' \n",
+    "TBLPROPERTIES (\n",
+    "    'classification'='csv', \n",
+    "    'skip.header.line.count'='1',  \n",
+    "    'serialization.null.format'='')\n",
+    "'''.format(bucket, root_key+'raw_table') \n",
+    "c.execute(q)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'ResponseMetadata': {'RequestId': 'C8707997FC007A2B',\n",
+       "  'HostId': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n",
+       "  'HTTPStatusCode': 200,\n",
+       "  'HTTPHeaders': {'x-amz-id-2': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n",
+       "   'x-amz-request-id': 'C8707997FC007A2B',\n",
+       "   'date': 'Thu, 18 Jun 2020 03:05:54 GMT',\n",
+       "   'x-amz-server-side-encryption': 'AES256',\n",
+       "   'etag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n",
+       "   'content-length': '0',\n",
+       "   'server': 'AmazonS3'},\n",
+       "  'RetryAttempts': 0},\n",
+       " 'ETag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n",
+       " 'ServerSideEncryption': 'AES256'}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# c.execute(\"LOAD DATA LOCAL INFILE %s INTO TABLE raw_table \"\n",
+    "#           \"FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\r\\n' \" \n",
+    "#           \"IGNORE 1 LINES \"\n",
+    "#           \"(reciept_id, last_name, first_name, \"\n",
+    "#           \" address_1, address_2, city, state, \"\n",
+    "#           \" zip, report_type, date_recieved, \"\n",
+    "#           \" loan_amount, amount, receipt_type, \"\n",
+    "#           \" employer, occupation, vendor_last_name, \"\n",
+    "#           \" vendor_first_name, vendor_address_1, \"\n",
+    "#           \" vendor_address_2, vendor_city, vendor_state, \"\n",
+    "#           \" vendor_zip, description, election_type, \"\n",
+    "#           \" election_year, \"\n",
+    "#           \" report_period_begin, report_period_end, \"\n",
+    "#           \" committee_name, committee_id, @dummy)\",\n",
+    "#           (contributions_txt_file,))\n",
+    "\n",
+    "df = pd.read_csv(contributions_txt_file, sep='\\t', error_bad_lines=False, dtype=str, index_col=0)\n",
+    "# Remove the very few records that mess up the demo \n",
+    "# (demo purposes only! Don't do something like this in production)\n",
+    "# c.execute(\"DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10\")\n",
+    "df = df[df['RcvDate'].str.len()>=10]\n",
+    "\n",
+    "# set empty, non-zero, strings in date columns to null\n",
+    "# c.execute(\"UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10\")\n",
+    "df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n",
+    "\n",
+    "# c.execute(\"UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10\")\n",
+    "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
+    "\n",
+    "#committee ID is required. Remove the 2 rows that don't have it.\n",
+    "# c.execute(\"DELETE FROM raw_table WHERE committee_id=''\");\n",
+    "df = df[df['ID'] != '']\n",
+    "\n",
+    "# There's a record with a date stuck in the committee_id column, which causes\n",
+    "# problems when inserting into the contributions table below. Get rid of it this \n",
+    "# way.\n",
+    "# c.execute(\"DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9\")\n",
+    "df = df[df['ID'].str.len() <=9]\n",
+    "\n",
+    "# Nullifying empty strings\n",
+    "df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
+    "\n",
+    "s3.put_object(Bucket=bucket, Key=root_key+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "creating donors table...\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print('creating donors table...')\n",
+    "# c.execute(\"CREATE TABLE donors \"\n",
+    "#           \"(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, \"\n",
+    "#           \" last_name VARCHAR(70), first_name VARCHAR(35), \"\n",
+    "#           \" address_1 VARCHAR(35), address_2 VARCHAR(36), \"\n",
+    "#           \" city VARCHAR(20), state VARCHAR(15), \"\n",
+    "#           \" zip VARCHAR(11), employer VARCHAR(70), \"\n",
+    "#           \" occupation VARCHAR(40)) \"\n",
+    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "# c.execute(\"INSERT INTO donors \"\n",
+    "#           \"(first_name, last_name, address_1,\"\n",
+    "#           \" address_2, city, state, zip, employer, occupation) \"\n",
+    "#           \"SELECT DISTINCT \"\n",
+    "#           \"TRIM(first_name), TRIM(last_name), TRIM(address_1),  \"\n",
+    "#           \"TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), \"\n",
+    "#           \"TRIM(employer), TRIM(occupation) \"\n",
+    "#           \"FROM raw_table\")\n",
+    "# conn.commit()\n",
+    "q='''\n",
+    "CREATE TABLE ria_data_science_s3.donors as\n",
+    "    with tmp as\n",
+    "      (SELECT DISTINCT \n",
+    "           TRIM(last_name) as last_name, TRIM(first_name) as first_name, \n",
+    "           TRIM(address_1) as address_1, TRIM(address_2) as address_2, \n",
+    "           TRIM(city) city, TRIM(state) as state, \n",
+    "           TRIM(zip) as zip, TRIM(employer) as employer, \n",
+    "           TRIM(occupation) as occupation\n",
+    "      FROM ria_data_science_s3.raw_table)\n",
+    "    SELECT row_number() over () as donor_id, * from tmp'''\n",
+    "c.execute(q)\n",
+    "# print('creating indexes on donors table')\n",
+    "# c.execute(\"CREATE INDEX donors_donor_info ON donors \"\n",
+    "#           \"(last_name, first_name, address_1, address_2, city, \"\n",
+    "#           \" state, zip)\")\n",
+    "# conn.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# print('creating recipients table...')\n",
+    "# c.execute(\"CREATE TABLE recipients \"\n",
+    "#           \"(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) \"\n",
+    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "\n",
+    "# c.execute(\"INSERT IGNORE INTO recipients \"\n",
+    "#           \"SELECT DISTINCT committee_id, committee_name FROM raw_table\")\n",
+    "# conn.commit()\n",
+    "\n",
+    "q='''\n",
+    "CREATE TABLE ria_data_science_s3.recipients as\n",
+    "    SELECT DISTINCT committee_id, committee_name FROM ria_data_science_s3.raw_table\n",
+    "'''\n",
+    "c.execute(q)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "creating contributions table\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print('creating contributions table')\n",
+    "# c.execute(\"CREATE TABLE contributions \"\n",
+    "#           \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n",
+    "#           \" report_type VARCHAR(24), date_recieved DATE, \"\n",
+    "#           \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n",
+    "#           \" receipt_type VARCHAR(23), \"\n",
+    "#           \" vendor_last_name VARCHAR(70), \"\n",
+    "#           \" vendor_first_name VARCHAR(20), \"\n",
+    "#           \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n",
+    "#           \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n",
+    "#           \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n",
+    "#           \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n",
+    "#           \" report_period_begin DATE, report_period_end DATE) \"\n",
+    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "\n",
+    "\n",
+    "# c.execute(\"INSERT INTO contributions \"\n",
+    "#           \"SELECT reciept_id, donors.donor_id, committee_id, \"\n",
+    "#           \" report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), \"\n",
+    "#           \" loan_amount, amount, \"\n",
+    "#           \" receipt_type, vendor_last_name , \"\n",
+    "#           \" vendor_first_name, vendor_address_1, vendor_address_2, \"\n",
+    "#           \" vendor_city, vendor_state, vendor_zip, description, \"\n",
+    "#           \" election_type, election_year, \"\n",
+    "#           \" STR_TO_DATE(report_period_begin, '%m/%d/%Y'), \"\n",
+    "#           \" STR_TO_DATE(report_period_end, '%m/%d/%Y') \"\n",
+    "#           \"FROM raw_table JOIN donors ON \"\n",
+    "#           \"donors.first_name = TRIM(raw_table.first_name) AND \"\n",
+    "#           \"donors.last_name = TRIM(raw_table.last_name) AND \"\n",
+    "#           \"donors.address_1 = TRIM(raw_table.address_1) AND \"\n",
+    "#           \"donors.address_2 = TRIM(raw_table.address_2) AND \"\n",
+    "#           \"donors.city = TRIM(raw_table.city) AND \"\n",
+    "#           \"donors.state = TRIM(raw_table.state) AND \"\n",
+    "#           \"donors.employer = TRIM(raw_table.employer) AND \"\n",
+    "#           \"donors.occupation = TRIM(raw_table.occupation) AND \"\n",
+    "#           \"donors.zip = TRIM(raw_table.zip)\")\n",
+    "# conn.commit()\n",
+    "\n",
+    "q='''\n",
+    "CREATE TABLE ria_data_science_s3.contributions as\n",
+    "    SELECT reciept_id, donors.donor_id, committee_id, \n",
+    "        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n",
+    "        loan_amount, amount, \n",
+    "        receipt_type, vendor_last_name , \n",
+    "        vendor_first_name, vendor_address_1, vendor_address_2, \n",
+    "        vendor_city, vendor_state, vendor_zip, description, \n",
+    "        election_type, election_year, \n",
+    "        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n",
+    "        date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n",
+    "    FROM ria_data_science_s3.raw_table JOIN ria_data_science_s3.donors ON \n",
+    "        donors.first_name = TRIM(raw_table.first_name) AND \n",
+    "        donors.last_name = TRIM(raw_table.last_name) AND \n",
+    "        donors.address_1 = TRIM(raw_table.address_1) AND \n",
+    "        donors.address_2 = TRIM(raw_table.address_2) AND \n",
+    "        donors.city = TRIM(raw_table.city) AND \n",
+    "        donors.state = TRIM(raw_table.state) AND \n",
+    "        donors.employer = TRIM(raw_table.employer) AND \n",
+    "        donors.occupation = TRIM(raw_table.occupation) AND \n",
+    "        donors.zip = TRIM(raw_table.zip)'''\n",
+    "c.execute(q)\n",
+    "\n",
+    "\n",
+    "# print('creating indexes on contributions')\n",
+    "# c.execute(\"ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)\")\n",
+    "# c.execute(\"CREATE INDEX donor_idx ON contributions (donor_id)\")\n",
+    "# c.execute(\"CREATE INDEX recipient_idx ON contributions (recipient_id)\")\n",
+    "\n",
+    "\n",
+    "# conn.commit()\n",
+    "\n",
+    "# print('nullifying empty strings in donors')\n",
+    "# c.execute(\"UPDATE donors \"\n",
+    "#           \"SET \"\n",
+    "#           \"first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, \"\n",
+    "#           \"last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, \"\n",
+    "#           \"address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, \"\n",
+    "#           \"address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, \"\n",
+    "#           \"city = CASE city WHEN '' THEN NULL ELSE city END, \"\n",
+    "#           \"state = CASE state WHEN '' THEN NULL ELSE state END, \"\n",
+    "#           \"employer = CASE employer WHEN '' THEN NULL ELSE employer END, \" \n",
+    "#           \"occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, \" \n",
+    "#           \"zip = CASE zip WHEN '' THEN NULL ELSE zip END\")\n",
+    "\n",
+    "\n",
+    "# conn.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# c.execute(\"CREATE TABLE processed_donors AS \" \n",
+    "#           \"(SELECT donor_id, \" \n",
+    "#           \" LOWER(city) AS city, \" \n",
+    "#           \" CASE WHEN (first_name IS NULL AND last_name IS NULL) \"\n",
+    "#           \"      THEN NULL \"\n",
+    "#           \"      ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) \"\n",
+    "#           \" END AS name, \" \n",
+    "#           \" LOWER(zip) AS zip, \" \n",
+    "#           \" LOWER(state) AS state, \" \n",
+    "#           \" CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \"\n",
+    "#           \"      THEN NULL \"\n",
+    "#           \"      ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) \"\n",
+    "#           \" END AS address, \" \n",
+    "#           \" LOWER(occupation) AS occupation, \"\n",
+    "#           \" LOWER(employer) AS employer, \"\n",
+    "#           \" ISNULL(first_name) AS person \"\n",
+    "#           \" FROM donors)\")\n",
+    "q = '''\n",
+    "CREATE TABLE ria_data_science_s3.processed_donors AS  \n",
+    "    SELECT donor_id,  \n",
+    "     LOWER(city) AS city,  \n",
+    "     CASE WHEN (first_name IS NULL AND last_name IS NULL) \n",
+    "          THEN NULL \n",
+    "          ELSE LOWER(CONCAT(first_name, ' ', last_name)) \n",
+    "     END AS name,  \n",
+    "     LOWER(zip) AS zip,  \n",
+    "     LOWER(state) AS state,  \n",
+    "     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n",
+    "          THEN NULL \n",
+    "          ELSE LOWER(CONCAT(address_1, ' ', address_2)) \n",
+    "     END AS address,  \n",
+    "     LOWER(occupation) AS occupation, \n",
+    "     LOWER(employer) AS employer, \n",
+    "     first_name is null AS person \n",
+    " FROM ria_data_science_s3.donors'''\n",
+    "c.execute(q)\n",
+    "\n",
+    "\n",
+    "# c.execute(\"CREATE INDEX donor_idx ON processed_donors (donor_id)\")\n",
+    "\n",
+    "# c.close()\n",
+    "# conn.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('done')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1762975, 29)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/presto_example/README.md b/presto_example/README.md
new file mode 100644
index 00000000..a027b3b0
--- /dev/null
+++ b/presto_example/README.md
@@ -0,0 +1,23 @@
+# MySQL Example
+
+Takes a database of IL campaign contribution data, loads it into a
+MySQL database, and identifies the unique donors. This can take a few
+hours and will noticeably tax your laptop. You might want to run it
+overnight.
+
+To follow this example you need to:
+
+* Create a MySQL database called 'contributions'
+* Copy `presto_example/mysql.cnf_LOCAL` to `presto_example/mysql.cnf`
+* Update `presto_example/mysql.cnf` with your MySQL username and password
+* Install dependencies, `pip install -r requirements.txt`
+
+Once that's all done you can run the example:
+
+```bash
+cd presto_example
+python mysql_init_db.py 
+python mysql_example.py
+```
+
+  (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) 
diff --git a/presto_example/mysql.cnf_LOCAL b/presto_example/mysql.cnf_LOCAL
new file mode 100644
index 00000000..17bded3f
--- /dev/null
+++ b/presto_example/mysql.cnf_LOCAL
@@ -0,0 +1,4 @@
+[client]
+user = your_mysql_user
+password = your_mysql_password
+default-character-set=utf8
diff --git a/presto_example/mysql_example.py b/presto_example/mysql_example.py
new file mode 100644
index 00000000..5e257e13
--- /dev/null
+++ b/presto_example/mysql_example.py
@@ -0,0 +1,344 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+This is an example of working with very large data. There are about
+700,000 unduplicated donors in this database of Illinois political
+campaign contributions.
+
+With such a large set of input data, we cannot store all the comparisons
+we need to make in memory. Instead, we will read the pairs on demand
+from the MySQL database.
+
+__Note:__ You will need to run `python mysql_init_db.py`
+before running this script. See the annotated source for
+[mysql_init_db.py](mysql_init_db.html)
+
+For smaller datasets (<10,000), see our
+[csv_example](csv_example.html)
+"""
+
+import os
+import itertools
+import time
+import logging
+import optparse
+import locale
+import json
+
+import MySQLdb
+import MySQLdb.cursors
+
+import dedupe
+import dedupe.backport
+
+
+def record_pairs(result_set):  # generator over rows of (a_id, a_json, b_id, b_json)
+    for i, row in enumerate(result_set):
+        a_record_id, a_record, b_record_id, b_record = row
+        record_a = (a_record_id, json.loads(a_record))  # decode the JSON-encoded record
+        record_b = (b_record_id, json.loads(b_record))
+        # Emit the pair as ((a_id, a_record), (b_id, b_record)).
+        yield record_a, record_b
+        # Progress indicator: print the row index every 10,000 rows, starting at 0.
+        if i % 10000 == 0:
+            print(i)
+
+
+def cluster_ids(clustered_dupes):
+    # Flatten (cluster, scores) pairs into (donor_id, cluster_id, score) tuples.
+    for cluster, scores in clustered_dupes:
+        cluster_id = cluster[0]  # the first member's id is used as the cluster id
+        for donor_id, score in zip(cluster, scores):
+            yield donor_id, cluster_id, score
+
+
+if __name__ == '__main__':
+
+    # ## Logging
+
+    # Dedupe uses Python logging to show or suppress verbose output. Added
+    # for convenience.  To enable verbose output, run `python
+    # examples/mysql_example/mysql_example.py -v`
+    optp = optparse.OptionParser()
+    optp.add_option('-v', '--verbose', dest='verbose', action='count',
+                    help='Increase verbosity (specify multiple times for more)'
+                    )
+    (opts, args) = optp.parse_args()
+    log_level = logging.WARNING
+    if opts.verbose:
+        if opts.verbose == 1:
+            log_level = logging.INFO
+        elif opts.verbose >= 2:
+            log_level = logging.DEBUG
+    logging.getLogger().setLevel(log_level)
+
+    # ## Setup
+    MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf'
+
+    settings_file = 'mysql_example_settings'
+    training_file = 'mysql_example_training.json'
+
+    start_time = time.time()
+
+    # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to
+    # `examples/mysql_example/mysql.cnf` and fill in your mysql database
+    # information in `examples/mysql_example/mysql.cnf`
+
+    # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid
+    # having to have enormous result sets in
+    # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor).
+    read_con = MySQLdb.connect(db='contributions',
+                               charset='utf8',
+                               read_default_file=MYSQL_CNF,
+                               cursorclass=MySQLdb.cursors.SSDictCursor)
+
+    write_con = MySQLdb.connect(db='contributions',
+                                charset='utf8',
+                                read_default_file=MYSQL_CNF)
+
+    # We'll be using variations on this following select statement to pull
+    # in campaign donor info.
+    #
+    # We did a fair amount of preprocessing of the fields in
+    # `mysql_init_db.py`
+
+    DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address " \
+                   "from processed_donors"
+
+    # ## Training
+
+    if os.path.exists(settings_file):
+        print('reading from ', settings_file)
+        with open(settings_file, 'rb') as sf:
+            deduper = dedupe.StaticDedupe(sf, num_cores=4)
+    else:
+        # Define the fields dedupe will pay attention to
+        #
+        # The address, city, and zip fields are often missing, so we'll
+        # tell dedupe that, and we'll learn a model that takes that into
+        # account
+        fields = [{'field': 'name', 'type': 'String'},
+                  {'field': 'address', 'type': 'String',
+                   'has missing': True},
+                  {'field': 'city', 'type': 'ShortString', 'has missing': True},
+                  {'field': 'state', 'type': 'ShortString', 'has missing': True},
+                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},
+                  ]
+
+        # Create a new deduper object and pass our data model to it.
+        deduper = dedupe.Dedupe(fields, num_cores=4)
+
+        # We will sample pairs from the entire donor table for training
+        with read_con.cursor() as cur:
+            cur.execute(DONOR_SELECT)
+            temp_d = {i: row for i, row in enumerate(cur)}
+
+        # If we have training data saved from a previous run of dedupe,
+        # look for it and load it in.
+        #
+        # __Note:__ if you want to train from
+        # scratch, delete the training_file
+        if os.path.exists(training_file):
+            print('reading labeled examples from ', training_file)
+            with open(training_file) as tf:
+                deduper.prepare_training(temp_d, training_file=tf)
+        else:
+            deduper.prepare_training(temp_d)
+
+        del temp_d
+
+        # ## Active learning
+
+        print('starting active labeling...')
+        # Starts the training loop. Dedupe will find the next pair of records
+        # it is least certain about and ask you to label them as duplicates
+        # or not.
+
+        # use 'y', 'n' and 'u' keys to flag duplicates
+        # press 'f' when you are finished
+        dedupe.convenience.console_label(deduper)
+        # When finished, save our labeled, training pairs to disk
+        with open(training_file, 'w') as tf:
+            deduper.write_training(tf)
+
+        # Notice the argument here
+        #
+        # `recall` is the proportion of true dupes pairs that the learned
+        # rules must cover. You may want to reduce this if you are making
+        # too many blocks and too many comparisons.
+        deduper.train(recall=0.90)
+
+        with open(settings_file, 'wb') as sf:
+            deduper.write_settings(sf)
+
+        # We can now remove some of the memory-hogging objects we used
+        # for training
+        deduper.cleanup_training()
+
+    # ## Blocking
+
+    print('blocking...')
+
+    # To run blocking on such a large set of data, we create a separate table
+    # that contains blocking keys and record ids
+    print('creating blocking_map database')
+    with write_con.cursor() as cur:
+        cur.execute("DROP TABLE IF EXISTS blocking_map")
+        cur.execute("CREATE TABLE blocking_map "
+                    "(block_key VARCHAR(200), donor_id INTEGER) "
+                    "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
+
+    write_con.commit()
+
+    # If dedupe learned an Index Predicate, we have to take a pass
+    # through the data and create indices.
+    print('creating inverted index')
+
+    for field in deduper.fingerprinter.index_fields:
+        with read_con.cursor() as cur:
+            cur.execute("SELECT DISTINCT {field} FROM processed_donors "
+                        "WHERE {field} IS NOT NULL".format(field=field))
+            field_data = (row[0] for row in cur)
+            deduper.fingerprinter.index(field_data, field)
+
+    # Now we are ready to write our blocking map table by creating a
+    # generator that yields unique `(block_key, donor_id)` tuples.
+    print('writing blocking map')
+
+    with read_con.cursor() as read_cur:
+        read_cur.execute(DONOR_SELECT)
+        full_data = ((row['donor_id'], row) for row in read_cur)
+        b_data = deduper.fingerprinter(full_data)
+
+        with write_con.cursor() as write_cur:
+
+            write_cur.executemany("INSERT INTO blocking_map VALUES (%s, %s)",
+                                  b_data)
+
+    write_con.commit()
+
+    # Free up memory by removing indices we don't need anymore
+    deduper.fingerprinter.reset_indices()
+
+    # indexing blocking_map
+    print('creating index')
+    with write_con.cursor() as cur:
+        cur.execute("CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)")
+
+    write_con.commit()
+    read_con.commit()
+
+    # select unique pairs to compare
+    with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur:
+
+        read_cur.execute("""
+               select a.donor_id,
+                      json_object('city', a.city,
+                                  'name', a.name,
+                                  'zip', a.zip,
+                                  'state', a.state,
+                                  'address', a.address),
+                      b.donor_id,
+                      json_object('city', b.city,
+                                  'name', b.name,
+                                  'zip', b.zip,
+                                  'state', b.state,
+                                  'address', b.address)
+               from (select DISTINCT l.donor_id as east, r.donor_id as west
+                     from blocking_map as l
+                     INNER JOIN blocking_map as r
+                     using (block_key)
+                     where l.donor_id < r.donor_id) ids
+               INNER JOIN processed_donors a on ids.east=a.donor_id
+               INNER JOIN processed_donors b on ids.west=b.donor_id
+               """)
+
+        # ## Clustering
+
+        print('clustering...')
+        clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
+                                          threshold=0.5)
+
+        with write_con.cursor() as write_cur:
+
+            # ## Writing out results
+
+            # We now have a sequence of tuples of donor ids that dedupe believes
+            # all refer to the same entity. We write this out onto an entity map
+            # table
+            write_cur.execute("DROP TABLE IF EXISTS entity_map")
+
+            print('creating entity_map database')
+            write_cur.execute("CREATE TABLE entity_map "
+                              "(donor_id INTEGER, canon_id INTEGER, "
+                              " cluster_score FLOAT, PRIMARY KEY(donor_id))")
+
+            write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)',
+                                  cluster_ids(clustered_dupes))
+
+    write_con.commit()
+
+    with write_con.cursor() as cur:
+        cur.execute("CREATE INDEX head_index ON entity_map (canon_id)")
+
+    write_con.commit()
+    read_con.commit()
+
+    # Print out the number of duplicates found
+    print('# duplicate sets')
+
+    # ## Payoff
+
+    # With all this done, we can now begin to ask interesting questions
+    # of the data
+    #
+    # For example, let's see who the top 10 donors are.
+
+    locale.setlocale(locale.LC_ALL, '')  # for pretty printing numbers
+
+    with read_con.cursor() as cur:
+        # Create a temporary table so each group and unmatched record has
+        # a unique id
+        cur.execute("CREATE TEMPORARY TABLE e_map "
+                    "SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id "
+                    "FROM entity_map "
+                    "RIGHT JOIN donors USING(donor_id)")
+
+        cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, "
+                    "donation_totals.totals AS totals "
+                    "FROM donors INNER JOIN "
+                    "(SELECT canon_id, SUM(amount) AS totals "
+                    " FROM contributions INNER JOIN e_map "
+                    " USING (donor_id) "
+                    " GROUP BY (canon_id) "
+                    " ORDER BY totals "
+                    " DESC LIMIT 10) "
+                    "AS donation_totals "
+                    "WHERE donors.donor_id = donation_totals.canon_id")
+
+        print("Top Donors (deduped)")
+        for row in cur:
+            row['totals'] = locale.currency(row['totals'], grouping=True)
+            print('%(totals)20s: %(name)s' % row)
+
+        # Compare this to what we would have gotten if we hadn't done any
+        # deduplication
+        cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, "
+                    "SUM(contributions.amount) AS totals "
+                    "FROM donors INNER JOIN contributions "
+                    "USING (donor_id) "
+                    "GROUP BY (donor_id) "
+                    "ORDER BY totals DESC "
+                    "LIMIT 10")
+
+        print("Top Donors (raw)")
+        for row in cur:
+            row['totals'] = locale.currency(row['totals'], grouping=True)
+            print('%(totals)20s: %(name)s' % row)
+
+        # Close our database connection
+    read_con.close()
+    write_con.close()
+
+    print('ran in', time.time() - start_time, 'seconds')
diff --git a/presto_example/mysql_init_db.py b/presto_example/mysql_init_db.py
new file mode 100644
index 00000000..fcdc1256
--- /dev/null
+++ b/presto_example/mysql_init_db.py
@@ -0,0 +1,234 @@
+#!/usr/bin/python
+"""
+This is a setup script for mysql_example.  It downloads a zip file of
+Illinois campaign contributions and loads them into a MySQL database
+named 'contributions'.
+ 
+__Note:__ You will need to run this script first before executing
+[mysql_example.py](mysql_example.html).
+ 
+Tables created:
+* raw_table - raw import of entire CSV file
+* donors - all distinct donors based on name and address
+* recipients - all distinct campaign contribution recipients
+* contributions - contribution amounts tied to donor and recipients tables
+"""
+
+import os
+import zipfile
+import warnings
+
+from urllib.request import urlopen
+
+import MySQLdb
+
+warnings.filterwarnings('ignore', category=MySQLdb.Warning)
+
+contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'
+contributions_txt_file = 'Illinois-campaign-contributions.txt'
+
+if not os.path.exists(contributions_zip_file) :
+    print('downloading', contributions_zip_file, '(~60mb) ...')
+    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')
+    localFile = open(contributions_zip_file, 'wb')
+    localFile.write(u.read())
+    localFile.close()
+
+if not os.path.exists(contributions_txt_file) :
+    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')
+    print('extracting %s' % contributions_zip_file)
+    zip_file_contents = zip_file.namelist()
+    for f in zip_file_contents:
+        if ('.txt' in f):
+            zip_file.extract(f)
+    zip_file.close()
+
+conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf', 
+                       local_infile = 1,
+                       sql_mode="ALLOW_INVALID_DATES",
+                       db='contributions')
+c = conn.cursor()
+
+print('importing raw data from csv...')
+c.execute("DROP TABLE IF EXISTS raw_table")
+c.execute("DROP TABLE IF EXISTS donors")
+c.execute("DROP TABLE IF EXISTS recipients")
+c.execute("DROP TABLE IF EXISTS contributions")
+c.execute("DROP TABLE IF EXISTS processed_donors")
+
+c.execute("CREATE TABLE raw_table "
+          "(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), "
+          " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), "
+          " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), "
+          " date_recieved VARCHAR(10), loan_amount VARCHAR(12), "
+          " amount VARCHAR(23), receipt_type VARCHAR(23), "
+          " employer VARCHAR(70), occupation VARCHAR(40), "
+          " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), "
+          " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
+          " vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
+          " vendor_zip VARCHAR(10), description VARCHAR(90), "
+          " election_type VARCHAR(10), election_year VARCHAR(10), "
+          " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), "
+          " committee_name VARCHAR(70), committee_id VARCHAR(37)) "
+          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
+
+
+conn.commit()
+
+c.execute("LOAD DATA LOCAL INFILE %s INTO TABLE raw_table "
+          "FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\r\n' " 
+          "IGNORE 1 LINES "
+          "(reciept_id, last_name, first_name, "
+          " address_1, address_2, city, state, "
+          " zip, report_type, date_recieved, "
+          " loan_amount, amount, receipt_type, "
+          " employer, occupation, vendor_last_name, "
+          " vendor_first_name, vendor_address_1, "
+          " vendor_address_2, vendor_city, vendor_state, "
+          " vendor_zip, description, election_type, "
+          " election_year, "
+          " report_period_begin, report_period_end, "
+          " committee_name, committee_id, @dummy)",
+          (contributions_txt_file,))
+
+# Remove the very few records that mess up the demo 
+# (demo purposes only! Don't do something like this in production)
+c.execute("DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10")
+
+# set empty or incomplete (shorter than 10 characters) strings in date columns to null
+c.execute("UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10")
+c.execute("UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10")
+
+# committee ID is required. Remove the 2 rows that don't have it.
+c.execute("DELETE FROM raw_table WHERE committee_id=''");
+
+# There's a record with a date stuck in the committee_id column, which causes
+# problems when inserting into the contributions table below. Get rid of it this 
+# way.
+c.execute("DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9")
+conn.commit()
+
+
+
+print('creating donors table...')
+c.execute("CREATE TABLE donors "
+          "(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, "
+          " last_name VARCHAR(70), first_name VARCHAR(35), "
+          " address_1 VARCHAR(35), address_2 VARCHAR(36), "
+          " city VARCHAR(20), state VARCHAR(15), "
+          " zip VARCHAR(11), employer VARCHAR(70), "
+          " occupation VARCHAR(40)) "
+          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
+c.execute("INSERT INTO donors "
+          "(first_name, last_name, address_1,"
+          " address_2, city, state, zip, employer, occupation) "
+          "SELECT DISTINCT "
+          "TRIM(first_name), TRIM(last_name), TRIM(address_1),  "
+          "TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), "
+          "TRIM(employer), TRIM(occupation) "
+          "FROM raw_table")
+conn.commit()
+
+
+print('creating indexes on donors table')
+c.execute("CREATE INDEX donors_donor_info ON donors "
+          "(last_name, first_name, address_1, address_2, city, "
+          " state, zip)")
+conn.commit()
+
+
+
+print('creating recipients table...')
+c.execute("CREATE TABLE recipients "
+          "(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) "
+          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
+
+c.execute("INSERT IGNORE INTO recipients "
+          "SELECT DISTINCT committee_id, committee_name FROM raw_table")
+conn.commit()
+
+print('creating contributions table')
+c.execute("CREATE TABLE contributions "
+          "(contribution_id INT, donor_id INT, recipient_id INT, "
+          " report_type VARCHAR(24), date_recieved DATE, "
+          " loan_amount VARCHAR(12), amount VARCHAR(23), "
+          " receipt_type VARCHAR(23), "
+          " vendor_last_name VARCHAR(70), "
+          " vendor_first_name VARCHAR(20), "
+          " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
+          " vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
+          " vendor_zip VARCHAR(10), description VARCHAR(90), "
+          " election_type VARCHAR(10), election_year VARCHAR(10), "
+          " report_period_begin DATE, report_period_end DATE) "
+          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
+
+
+c.execute("INSERT INTO contributions "
+          "SELECT reciept_id, donors.donor_id, committee_id, "
+          " report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), "
+          " loan_amount, amount, "
+          " receipt_type, vendor_last_name , "
+          " vendor_first_name, vendor_address_1, vendor_address_2, "
+          " vendor_city, vendor_state, vendor_zip, description, "
+          " election_type, election_year, "
+          " STR_TO_DATE(report_period_begin, '%m/%d/%Y'), "
+          " STR_TO_DATE(report_period_end, '%m/%d/%Y') "
+          "FROM raw_table JOIN donors ON "
+          "donors.first_name = TRIM(raw_table.first_name) AND "
+          "donors.last_name = TRIM(raw_table.last_name) AND "
+          "donors.address_1 = TRIM(raw_table.address_1) AND "
+          "donors.address_2 = TRIM(raw_table.address_2) AND "
+          "donors.city = TRIM(raw_table.city) AND "
+          "donors.state = TRIM(raw_table.state) AND "
+          "donors.employer = TRIM(raw_table.employer) AND "
+          "donors.occupation = TRIM(raw_table.occupation) AND "
+          "donors.zip = TRIM(raw_table.zip)")
+conn.commit()
+
+print('creating indexes on contributions')
+c.execute("ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)")
+c.execute("CREATE INDEX donor_idx ON contributions (donor_id)")
+c.execute("CREATE INDEX recipient_idx ON contributions (recipient_id)")
+
+
+conn.commit()
+
+print('nullifying empty strings in donors')
+c.execute("UPDATE donors "
+          "SET "
+          "first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, "
+          "last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, "
+          "address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, "
+          "address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, "
+          "city = CASE city WHEN '' THEN NULL ELSE city END, "
+          "state = CASE state WHEN '' THEN NULL ELSE state END, "
+          "employer = CASE employer WHEN '' THEN NULL ELSE employer END, " 
+          "occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, " 
+          "zip = CASE zip WHEN '' THEN NULL ELSE zip END")
+
+
+conn.commit()
+
+c.execute("CREATE TABLE processed_donors AS " 
+          "(SELECT donor_id, " 
+          " LOWER(city) AS city, " 
+          " CASE WHEN (first_name IS NULL AND last_name IS NULL) "
+          "      THEN NULL "
+          "      ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) "
+          " END AS name, " 
+          " LOWER(zip) AS zip, " 
+          " LOWER(state) AS state, " 
+          " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) "
+          "      THEN NULL "
+          "      ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) "
+          " END AS address, " 
+          " LOWER(occupation) AS occupation, "
+          " LOWER(employer) AS employer, "
+          " ISNULL(first_name) AS person "
+          " FROM donors)")
+ 
+c.execute("CREATE INDEX donor_idx ON processed_donors (donor_id)")
+
+c.close()
+conn.close()
+print('done')
diff --git a/presto_example/requirements.txt b/presto_example/requirements.txt
new file mode 100644
index 00000000..18c098ae
--- /dev/null
+++ b/presto_example/requirements.txt
@@ -0,0 +1 @@
+mysqlclient

From 881d0726f14dfe831621a7db3331c944b68a924a Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-18-24.eu-west-1.compute.internal>
Date: Fri, 19 Jun 2020 21:50:18 +0000
Subject: [PATCH 02/19] starting the example

---
 notebooks/mysql_example.ipynb | 595 ++++++++++++++++++++++++++++++++++
 notebooks/mysql_init_db.ipynb | 173 ++--------
 2 files changed, 615 insertions(+), 153 deletions(-)
 create mode 100644 notebooks/mysql_example.ipynb

diff --git a/notebooks/mysql_example.ipynb b/notebooks/mysql_example.ipynb
new file mode 100644
index 00000000..d4ba20ac
--- /dev/null
+++ b/notebooks/mysql_example.ipynb
@@ -0,0 +1,595 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n",
+    "bucket='com.ria.scratch'\n",
+    "region='eu-west-1'\n",
+    "workgroup = 'RIA'\n",
+    "root_key='as-dedupe/'\n",
+    "import sys\n",
+    "sys.path.insert(0, '../../dedupe/')\n",
+    "import mydedupe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %load ../mysql_example/mysql_example.py\n",
+    "#!/usr/bin/python\n",
+    "\n",
+    "\"\"\"\n",
+    "This is an example of working with very large data. There are about\n",
+    "700,000 unduplicated donors in this database of Illinois political\n",
+    "campaign contributions.\n",
+    "\n",
+    "With such a large set of input data, we cannot store all the comparisons\n",
+    "we need to make in memory. Instead, we will read the pairs on demand\n",
+    "from the MySQL database.\n",
+    "\n",
+    "__Note:__ You will need to run `python mysql_init_db.py`\n",
+    "before running this script. See the annotated source for\n",
+    "[mysql_init_db.py](mysql_init_db.html)\n",
+    "\n",
+    "For smaller datasets (<10,000), see our\n",
+    "[csv_example](csv_example.html)\n",
+    "\"\"\"\n",
+    "\n",
+    "import os\n",
+    "import itertools\n",
+    "import time\n",
+    "import logging\n",
+    "import optparse\n",
+    "import locale\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "\n",
+    "# import MySQLdb\n",
+    "# import MySQLdb.cursors\n",
+    "\n",
+    "import dedupe\n",
+    "import dedupe.backport\n",
+    "import boto3\n",
+    "from pyathena import connect\n",
+    "from pyathena.pandas_cursor import PandasCursor\n",
+    "\n",
+    "def record_pairs(result_set):\n",
+    "    for i, row in enumerate(result_set):\n",
+    "        a_record_id, a_record, b_record_id, b_record = row\n",
+    "        record_a = (a_record_id, json.loads(a_record))\n",
+    "        record_b = (b_record_id, json.loads(b_record))\n",
+    "\n",
+    "        yield record_a, record_b\n",
+    "\n",
+    "        if i % 10000 == 0:\n",
+    "            print(i)\n",
+    "\n",
+    "\n",
+    "def cluster_ids(clustered_dupes):\n",
+    "\n",
+    "    for cluster, scores in clustered_dupes:\n",
+    "        cluster_id = cluster[0]\n",
+    "        for donor_id, score in zip(cluster, scores):\n",
+    "            yield donor_id, cluster_id, score\n",
+    "\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "\n",
+    "    # ## Logging\n",
+    "\n",
+    "    # Dedupe uses Python logging to show or suppress verbose output. Added\n",
+    "    # for convenience.  To enable verbose output, run `python\n",
+    "    # examples/mysql_example/mysql_example.py -v`\n",
+    "    \n",
+    "#     optp = optparse.OptionParser()\n",
+    "#     optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
+    "#                     help='Increase verbosity (specify multiple times for more)'\n",
+    "#                     )\n",
+    "#     (opts, args) = optp.parse_args()\n",
+    "#     log_level = logging.WARNING\n",
+    "#     if opts.verbose:\n",
+    "#         if opts.verbose == 1:\n",
+    "#             log_level = logging.INFO\n",
+    "#         elif opts.verbose >= 2:\n",
+    "#             log_level = logging.DEBUG\n",
+    "\n",
+    "## Armin\n",
+    "    log_level = logging.WARNING\n",
+    "#######\n",
+    "\n",
+    "    logging.getLogger().setLevel(log_level)\n",
+    "\n",
+    "    \n",
+    "\n",
+    "#     # ## Setup\n",
+    "#     MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf'\n",
+    "\n",
+    "    settings_file = 'mysql_example_settings'\n",
+    "    training_file = 'mysql_example_training.json'\n",
+    "\n",
+    "    start_time = time.time()\n",
+    "\n",
+    "    # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to\n",
+    "    # `examples/mysql_example/mysql.cnf` and fill in your mysql database\n",
+    "    # information in `examples/mysql_example/mysql.cnf`\n",
+    "\n",
+    "    # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid\n",
+    "    # having to have enormous result sets in\n",
+    "    # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor).\n",
+    "#     read_con = MySQLdb.connect(db='contributions',\n",
+    "#                                charset='utf8',\n",
+    "#                                read_default_file=MYSQL_CNF,\n",
+    "#                                cursorclass=MySQLdb.cursors.SSDictCursor)\n",
+    "\n",
+    "#     write_con = MySQLdb.connect(db='contributions',\n",
+    "#                                 charset='utf8',\n",
+    "#                                 read_default_file=MYSQL_CNF)\n",
+    "\n",
+    "s3 = boto3.client('s3')  \n",
+    "conn = connect(s3_staging_dir=athena_garbage,\n",
+    "                 region_name=region, work_group=workgroup)\n",
+    "cur = conn.cursor(PandasCursor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    # We'll be using variations on this following select statement to pull\n",
+    "    # in campaign donor info.\n",
+    "    #\n",
+    "    # We did a fair amount of preprocessing of the fields in\n",
+    "    # `mysql_init_db.py`    \n",
+    "    DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n",
+    "                   \"from ria_data_science_s3.processed_donors limit 1000\"\n",
+    "\n",
+    "    # ## Training\n",
+    "\n",
+    "    if os.path.exists(settings_file):\n",
+    "        print('reading from ', settings_file)\n",
+    "        with open(settings_file, 'rb') as sf:\n",
+    "            deduper = dedupe.StaticDedupe(sf, num_cores=4)\n",
+    "    else:\n",
+    "        # Define the fields dedupe will pay attention to\n",
+    "        #\n",
+    "        # The address, city, and zip fields are often missing, so we'll\n",
+    "        # tell dedupe that, and we'll learn a model that take that into\n",
+    "        # account\n",
+    "        fields = [{'field': 'name', 'type': 'String'},\n",
+    "                  {'field': 'address', 'type': 'String',\n",
+    "                   'has missing': True},\n",
+    "                  {'field': 'city', 'type': 'ShortString', 'has missing': True},\n",
+    "                  {'field': 'state', 'type': 'ShortString', 'has missing': True},\n",
+    "                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n",
+    "                  ]\n",
+    "\n",
+    "        # Create a new deduper object and pass our data model to it.\n",
+    "        deduper = mydedupe.Dedupe(fields, num_cores=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "        # We will sample pairs from the entire donor table for training\n",
+    "#         with read_con.cursor() as cur:\n",
+    "#         cur.execute(DONOR_SELECT)\n",
+    "#         temp_d = {i: row for i, row in enumerate(cur)}\n",
+    "\n",
+    "        #Armin: Very Suspicious, does the ssdictcursor convert everything to string?\n",
+    "        df = cur.execute(DONOR_SELECT).as_pandas()#.astype(str)\n",
+    "        temp_d = df.where(pd.notnull(df), None).to_dict('index')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "name : None\n",
+      "address : None\n",
+      "city : st. louis\n",
+      "state : mo\n",
+      "zip : 63118\n",
+      "\n",
+      "name : None\n",
+      "address : None\n",
+      "city : None\n",
+      "state : il\n",
+      "zip : None\n",
+      "\n",
+      "0/10 positive, 0/10 negative\n",
+      "Do these records refer to the same thing?\n",
+      "(y)es / (n)o / (u)nsure / (f)inished\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "starting active labeling...\n",
+      "y\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "name : nick manousopoulos\n",
+      "address : None\n",
+      "city : calumet city\n",
+      "state : il\n",
+      "zip : 60409\n",
+      "\n",
+      "name : None\n",
+      "address : None\n",
+      "city : None\n",
+      "state : il\n",
+      "zip : None\n",
+      "\n",
+      "1/10 positive, 0/10 negative\n",
+      "Do these records refer to the same thing?\n",
+      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "y\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "name : gary h. smith\n",
+      "address : 205 w. wacker drive suite 510\n",
+      "city : chicago\n",
+      "state : il\n",
+      "zip : 60606\n",
+      "\n",
+      "name : None\n",
+      "address : None\n",
+      "city : chicago\n",
+      "state : il\n",
+      "zip : 60606\n",
+      "\n",
+      "2/10 positive, 0/10 negative\n",
+      "Do these records refer to the same thing?\n",
+      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "y\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "name : sam vinson\n",
+      "address : None\n",
+      "city : chicago\n",
+      "state : il\n",
+      "zip : 60602\n",
+      "\n",
+      "name : john dore\n",
+      "address : 134 n. lasalle #1508\n",
+      "city : chicago\n",
+      "state : il\n",
+      "zip : 60602\n",
+      "\n",
+      "3/10 positive, 0/10 negative\n",
+      "Do these records refer to the same thing?\n",
+      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "f\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Finished labeling\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "79578 is not JSON serializable",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-9-5b994906c6be>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[0;31m# When finished, save our labeled, training pairs to disk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m     \u001b[0mdeduper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_training\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mwrite_training\u001b[0;34m(self, file_obj)\u001b[0m\n\u001b[1;32m   1059\u001b[0m                   \u001b[0mfile_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1060\u001b[0m                   \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mserializer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_to_json\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1061\u001b[0;31m                   ensure_ascii=True)\n\u001b[0m\u001b[1;32m   1062\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1063\u001b[0m     def write_settings(self,\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/__init__.py\u001b[0m in \u001b[0;36mdump\u001b[0;34m(obj, fp, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)\u001b[0m\n\u001b[1;32m    177\u001b[0m     \u001b[0;31m# could accelerate with writelines in some versions of Python, at\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    178\u001b[0m     \u001b[0;31m# a debuggability cost\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m     \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    180\u001b[0m         \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m    428\u001b[0m             \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    429\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 430\u001b[0;31m             \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    431\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    432\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m    402\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    403\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    405\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    406\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m    323\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    324\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    326\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    327\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m    323\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    324\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    326\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    327\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m    402\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    403\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    405\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    406\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m    435\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Circular reference detected\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    436\u001b[0m                 \u001b[0mmarkers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmarkerid\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 437\u001b[0;31m             \u001b[0mo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    438\u001b[0m             \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    439\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/serializer.py\u001b[0m in \u001b[0;36m_to_json\u001b[0;34m(python_object)\u001b[0m\n\u001b[1;32m     19\u001b[0m                          '__value__': list(python_object)}\n\u001b[1;32m     20\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpython_object\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' is not JSON serializable'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     23\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mpython_object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: 79578 is not JSON serializable"
+     ]
+    }
+   ],
+   "source": [
+    "        # If we have training data saved from a previous run of dedupe,\n",
+    "        # look for it an load it in.\n",
+    "        #\n",
+    "        # __Note:__ if you want to train from\n",
+    "        # scratch, delete the training_file\n",
+    "        if os.path.exists(training_file):\n",
+    "            print('reading labeled examples from ', training_file)\n",
+    "            with open(training_file) as tf:\n",
+    "                deduper.prepare_training(temp_d, training_file=tf)\n",
+    "        else:\n",
+    "            deduper.prepare_training(temp_d)\n",
+    "\n",
+    "        del temp_d\n",
+    "\n",
+    "        # ## Active learning\n",
+    "\n",
+    "        print('starting active labeling...')\n",
+    "        # Starts the training loop. Dedupe will find the next pair of records\n",
+    "        # it is least certain about and ask you to label them as duplicates\n",
+    "        # or not.\n",
+    "\n",
+    "        # use 'y', 'n' and 'u' keys to flag duplicates\n",
+    "        # press 'f' when you are finished\n",
+    "        dedupe.convenience.console_label(deduper)\n",
+    "        # When finished, save our labeled, training pairs to disk\n",
+    "        with open(training_file, 'w') as tf:\n",
+    "            deduper.write_training(tf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "        # Notice the argument here\n",
+    "        #\n",
+    "        # `recall` is the proportion of true dupe pairs that the learned\n",
+    "        # rules must cover. You may want to reduce this if you are making\n",
+    "        # too many blocks and too many comparisons.\n",
+    "        deduper.train(recall=0.90)\n",
+    "\n",
+    "        with open(settings_file, 'wb') as sf:\n",
+    "            deduper.write_settings(sf)\n",
+    "\n",
+    "        # We can now remove some of the memory-hogging objects we used\n",
+    "        # for training\n",
+    "        deduper.cleanup_training()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    # ## Blocking\n",
+    "\n",
+    "    print('blocking...')\n",
+    "\n",
+    "    # To run blocking on such a large set of data, we create a separate table\n",
+    "    # that contains blocking keys and record ids\n",
+    "    print('creating blocking_map database')\n",
+    "    with write_con.cursor() as cur:\n",
+    "        cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n",
+    "        cur.execute(\"CREATE TABLE blocking_map \"\n",
+    "                    \"(block_key VARCHAR(200), donor_id INTEGER) \"\n",
+    "                    \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "\n",
+    "    write_con.commit()\n",
+    "\n",
+    "    # If dedupe learned an Index Predicate, we have to take a pass\n",
+    "    # through the data and create indices.\n",
+    "    print('creating inverted index')\n",
+    "\n",
+    "    for field in deduper.fingerprinter.index_fields:\n",
+    "        with read_con.cursor() as cur:\n",
+    "            cur.execute(\"SELECT DISTINCT {field} FROM processed_donors \"\n",
+    "                        \"WHERE {field} IS NOT NULL\".format(field=field))\n",
+    "            field_data = (row[0] for row in cur)\n",
+    "            deduper.fingerprinter.index(field_data, field)\n",
+    "\n",
+    "    # Now we are ready to write our blocking map table by creating a\n",
+    "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
+    "    print('writing blocking map')\n",
+    "\n",
+    "    with read_con.cursor() as read_cur:\n",
+    "        read_cur.execute(DONOR_SELECT)\n",
+    "        full_data = ((row['donor_id'], row) for row in read_cur)\n",
+    "        b_data = deduper.fingerprinter(full_data)\n",
+    "\n",
+    "        with write_con.cursor() as write_cur:\n",
+    "\n",
+    "            write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n",
+    "                                  b_data)\n",
+    "\n",
+    "    write_con.commit()\n",
+    "\n",
+    "    # Free up memory by removing indices we don't need anymore\n",
+    "    deduper.fingerprinter.reset_indices()\n",
+    "\n",
+    "    # indexing blocking_map\n",
+    "    print('creating index')\n",
+    "    with write_con.cursor() as cur:\n",
+    "        cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n",
+    "\n",
+    "    write_con.commit()\n",
+    "    read_con.commit()\n",
+    "\n",
+    "    # select unique pairs to compare\n",
+    "    with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur:\n",
+    "\n",
+    "        read_cur.execute(\"\"\"\n",
+    "               select a.donor_id,\n",
+    "                      json_object('city', a.city,\n",
+    "                                  'name', a.name,\n",
+    "                                  'zip', a.zip,\n",
+    "                                  'state', a.state,\n",
+    "                                  'address', a.address),\n",
+    "                      b.donor_id,\n",
+    "                      json_object('city', b.city,\n",
+    "                                  'name', b.name,\n",
+    "                                  'zip', b.zip,\n",
+    "                                  'state', b.state,\n",
+    "                                  'address', b.address)\n",
+    "               from (select DISTINCT l.donor_id as east, r.donor_id as west\n",
+    "                     from blocking_map as l\n",
+    "                     INNER JOIN blocking_map as r\n",
+    "                     using (block_key)\n",
+    "                     where l.donor_id < r.donor_id) ids\n",
+    "               INNER JOIN processed_donors a on ids.east=a.donor_id\n",
+    "               INNER JOIN processed_donors b on ids.west=b.donor_id\n",
+    "               \"\"\")\n",
+    "\n",
+    "        # ## Clustering\n",
+    "\n",
+    "        print('clustering...')\n",
+    "        clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n",
+    "                                          threshold=0.5)\n",
+    "\n",
+    "        with write_con.cursor() as write_cur:\n",
+    "\n",
+    "            # ## Writing out results\n",
+    "\n",
+    "            # We now have a sequence of tuples of donor ids that dedupe believes\n",
+    "            # all refer to the same entity. We write this out onto an entity map\n",
+    "            # table\n",
+    "            write_cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n",
+    "\n",
+    "            print('creating entity_map database')\n",
+    "            write_cur.execute(\"CREATE TABLE entity_map \"\n",
+    "                              \"(donor_id INTEGER, canon_id INTEGER, \"\n",
+    "                              \" cluster_score FLOAT, PRIMARY KEY(donor_id))\")\n",
+    "\n",
+    "            write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)',\n",
+    "                                  cluster_ids(clustered_dupes))\n",
+    "\n",
+    "    write_con.commit()\n",
+    "\n",
+    "    with write_con.cursor() as cur:\n",
+    "        cur.execute(\"CREATE INDEX head_index ON entity_map (canon_id)\")\n",
+    "\n",
+    "    write_con.commit()\n",
+    "    read_con.commit()\n",
+    "\n",
+    "    # Print out the number of duplicates found\n",
+    "    print('# duplicate sets')\n",
+    "\n",
+    "    # ## Payoff\n",
+    "\n",
+    "    # With all this done, we can now begin to ask interesting questions\n",
+    "    # of the data\n",
+    "    #\n",
+    "    # For example, let's see who the top 10 donors are.\n",
+    "\n",
+    "    locale.setlocale(locale.LC_ALL, '')  # for pretty printing numbers\n",
+    "\n",
+    "    with read_con.cursor() as cur:\n",
+    "        # Create a temporary table so each group and unmatched record has\n",
+    "        # a unique id\n",
+    "        cur.execute(\"CREATE TEMPORARY TABLE e_map \"\n",
+    "                    \"SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id \"\n",
+    "                    \"FROM entity_map \"\n",
+    "                    \"RIGHT JOIN donors USING(donor_id)\")\n",
+    "\n",
+    "        cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, \"\n",
+    "                    \"donation_totals.totals AS totals \"\n",
+    "                    \"FROM donors INNER JOIN \"\n",
+    "                    \"(SELECT canon_id, SUM(amount) AS totals \"\n",
+    "                    \" FROM contributions INNER JOIN e_map \"\n",
+    "                    \" USING (donor_id) \"\n",
+    "                    \" GROUP BY (canon_id) \"\n",
+    "                    \" ORDER BY totals \"\n",
+    "                    \" DESC LIMIT 10) \"\n",
+    "                    \"AS donation_totals \"\n",
+    "                    \"WHERE donors.donor_id = donation_totals.canon_id\")\n",
+    "\n",
+    "        print(\"Top Donors (deduped)\")\n",
+    "        for row in cur:\n",
+    "            row['totals'] = locale.currency(row['totals'], grouping=True)\n",
+    "            print('%(totals)20s: %(name)s' % row)\n",
+    "\n",
+    "        # Compare this to what we would have gotten if we hadn't done any\n",
+    "        # deduplication\n",
+    "        cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, \"\n",
+    "                    \"SUM(contributions.amount) AS totals \"\n",
+    "                    \"FROM donors INNER JOIN contributions \"\n",
+    "                    \"USING (donor_id) \"\n",
+    "                    \"GROUP BY (donor_id) \"\n",
+    "                    \"ORDER BY totals DESC \"\n",
+    "                    \"LIMIT 10\")\n",
+    "\n",
+    "        print(\"Top Donors (raw)\")\n",
+    "        for row in cur:\n",
+    "            row['totals'] = locale.currency(row['totals'], grouping=True)\n",
+    "            print('%(totals)20s: %(name)s' % row)\n",
+    "\n",
+    "        # Close our database connection\n",
+    "    read_con.close()\n",
+    "    write_con.close()\n",
+    "\n",
+    "    print('ran in', time.time() - start_time, 'seconds')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/mysql_init_db.ipynb b/notebooks/mysql_init_db.ipynb
index 1dd956d9..bb6331a1 100644
--- a/notebooks/mysql_init_db.ipynb
+++ b/notebooks/mysql_init_db.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -15,11 +15,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# %load ../mysql_example/mysql_init_db.py\n",
     "#!/usr/bin/python\n",
     "\"\"\"\n",
     "This is a setup script for mysql_example.  It downloads a zip file of\n",
@@ -55,7 +54,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,7 +77,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -96,27 +95,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "importing raw data from csv...\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "print('importing raw data from csv...')\n",
     "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n",
@@ -128,20 +109,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# c.execute(\"CREATE TABLE raw_table \"\n",
     "#           \"(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \"\n",
@@ -191,43 +161,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'ResponseMetadata': {'RequestId': 'C8707997FC007A2B',\n",
-       "  'HostId': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n",
-       "  'HTTPStatusCode': 200,\n",
-       "  'HTTPHeaders': {'x-amz-id-2': 'pD0pZDu7WHeyS6gGA9JAV11Ns6QUZ99Iqjskl4Pvgd2V9cxZf2ulF8azIOgJnvWQ0Tv+DSJniEw=',\n",
-       "   'x-amz-request-id': 'C8707997FC007A2B',\n",
-       "   'date': 'Thu, 18 Jun 2020 03:05:54 GMT',\n",
-       "   'x-amz-server-side-encryption': 'AES256',\n",
-       "   'etag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n",
-       "   'content-length': '0',\n",
-       "   'server': 'AmazonS3'},\n",
-       "  'RetryAttempts': 0},\n",
-       " 'ETag': '\"42fa5ce005b346df46ed9bb9aa8fb140\"',\n",
-       " 'ServerSideEncryption': 'AES256'}"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# c.execute(\"LOAD DATA LOCAL INFILE %s INTO TABLE raw_table \"\n",
     "#           \"FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\r\\n' \" \n",
@@ -276,27 +212,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "creating donors table...\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "print('creating donors table...')\n",
     "# c.execute(\"CREATE TABLE donors \"\n",
@@ -337,20 +255,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# print('creating recipients table...')\n",
     "# c.execute(\"CREATE TABLE recipients \"\n",
@@ -370,27 +277,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "creating contributions table\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "print('creating contributions table')\n",
     "# c.execute(\"CREATE TABLE contributions \"\n",
@@ -481,20 +370,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<pyathena.cursor.Cursor at 0x7f2aaeeb8ef0>"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# c.execute(\"CREATE TABLE processed_donors AS \" \n",
     "#           \"(SELECT donor_id, \" \n",
@@ -551,20 +429,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(1762975, 29)"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "df.shape"
    ]

From 448fdf7c6dd3ddf63e84b9ef4647dc65b7014c57 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-27-169.eu-west-1.compute.internal>
Date: Mon, 22 Jun 2020 21:53:59 +0000
Subject: [PATCH 03/19] debugging

---
 notebooks/mysql_example.ipynb | 153 ++--------------------------------
 1 file changed, 6 insertions(+), 147 deletions(-)

diff --git a/notebooks/mysql_example.ipynb b/notebooks/mysql_example.ipynb
index d4ba20ac..2c68caaa 100644
--- a/notebooks/mysql_example.ipynb
+++ b/notebooks/mysql_example.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -140,7 +140,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -178,7 +178,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -194,150 +194,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "name : None\n",
-      "address : None\n",
-      "city : st. louis\n",
-      "state : mo\n",
-      "zip : 63118\n",
-      "\n",
-      "name : None\n",
-      "address : None\n",
-      "city : None\n",
-      "state : il\n",
-      "zip : None\n",
-      "\n",
-      "0/10 positive, 0/10 negative\n",
-      "Do these records refer to the same thing?\n",
-      "(y)es / (n)o / (u)nsure / (f)inished\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "starting active labeling...\n",
-      "y\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "name : nick manousopoulos\n",
-      "address : None\n",
-      "city : calumet city\n",
-      "state : il\n",
-      "zip : 60409\n",
-      "\n",
-      "name : None\n",
-      "address : None\n",
-      "city : None\n",
-      "state : il\n",
-      "zip : None\n",
-      "\n",
-      "1/10 positive, 0/10 negative\n",
-      "Do these records refer to the same thing?\n",
-      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "y\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "name : gary h. smith\n",
-      "address : 205 w. wacker drive suite 510\n",
-      "city : chicago\n",
-      "state : il\n",
-      "zip : 60606\n",
-      "\n",
-      "name : None\n",
-      "address : None\n",
-      "city : chicago\n",
-      "state : il\n",
-      "zip : 60606\n",
-      "\n",
-      "2/10 positive, 0/10 negative\n",
-      "Do these records refer to the same thing?\n",
-      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "y\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "name : sam vinson\n",
-      "address : None\n",
-      "city : chicago\n",
-      "state : il\n",
-      "zip : 60602\n",
-      "\n",
-      "name : john dore\n",
-      "address : 134 n. lasalle #1508\n",
-      "city : chicago\n",
-      "state : il\n",
-      "zip : 60602\n",
-      "\n",
-      "3/10 positive, 0/10 negative\n",
-      "Do these records refer to the same thing?\n",
-      "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "f\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Finished labeling\n"
-     ]
-    },
-    {
-     "ename": "TypeError",
-     "evalue": "79578 is not JSON serializable",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-9-5b994906c6be>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[0;31m# When finished, save our labeled, training pairs to disk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraining_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m     \u001b[0mdeduper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_training\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mwrite_training\u001b[0;34m(self, file_obj)\u001b[0m\n\u001b[1;32m   1059\u001b[0m                   \u001b[0mfile_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1060\u001b[0m                   \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mserializer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_to_json\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1061\u001b[0;31m                   ensure_ascii=True)\n\u001b[0m\u001b[1;32m   1062\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1063\u001b[0m     def write_settings(self,\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/__init__.py\u001b[0m in \u001b[0;36mdump\u001b[0;34m(obj, fp, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)\u001b[0m\n\u001b[1;32m    177\u001b[0m     \u001b[0;31m# could accelerate with writelines in some versions of Python, at\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    178\u001b[0m     \u001b[0;31m# a debuggability cost\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m     \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    180\u001b[0m         \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m    428\u001b[0m             \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    429\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 430\u001b[0;31m             \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    431\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    432\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m    402\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    403\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    405\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    406\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m    323\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    324\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    326\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    327\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_list\u001b[0;34m(lst, _current_indent_level)\u001b[0m\n\u001b[1;32m    323\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    324\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    326\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    327\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m    402\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    403\u001b[0m                     \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 404\u001b[0;31m                 \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    405\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mnewline_indent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    406\u001b[0m             \u001b[0m_current_indent_level\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/json/encoder.py\u001b[0m in \u001b[0;36m_iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m    435\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Circular reference detected\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    436\u001b[0m                 \u001b[0mmarkers\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmarkerid\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mo\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 437\u001b[0;31m             \u001b[0mo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    438\u001b[0m             \u001b[0;32myield\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m_iterencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_current_indent_level\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    439\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mmarkers\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/serializer.py\u001b[0m in \u001b[0;36m_to_json\u001b[0;34m(python_object)\u001b[0m\n\u001b[1;32m     19\u001b[0m                          '__value__': list(python_object)}\n\u001b[1;32m     20\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpython_object\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' is not JSON serializable'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     23\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mpython_object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mTypeError\u001b[0m: 79578 is not JSON serializable"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "        # If we have training data saved from a previous run of dedupe,\n",
     "        # look for it an load it in.\n",

From a7a2bfda90975dff4494463a528ff7cd5581975e Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-27-169.eu-west-1.compute.internal>
Date: Mon, 22 Jun 2020 22:07:59 +0000
Subject: [PATCH 04/19] rename

---
 ...sql_example.ipynb => athena_example.ipynb} | 19 +++++++++++++++----
 ...sql_init_db.ipynb => athena_init_db.ipynb} |  0
 2 files changed, 15 insertions(+), 4 deletions(-)
 rename notebooks/{mysql_example.ipynb => athena_example.ipynb} (96%)
 rename notebooks/{mysql_init_db.ipynb => athena_init_db.ipynb} (100%)

diff --git a/notebooks/mysql_example.ipynb b/notebooks/athena_example.ipynb
similarity index 96%
rename from notebooks/mysql_example.ipynb
rename to notebooks/athena_example.ipynb
index 2c68caaa..3a4ca13a 100644
--- a/notebooks/mysql_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -13,7 +13,7 @@
     "root_key='as-dedupe/'\n",
     "import sys\n",
     "sys.path.insert(0, '../../dedupe/')\n",
-    "import mydedupe"
+    "import dedupe"
    ]
   },
   {
@@ -138,6 +138,16 @@
     "cur = conn.cursor(PandasCursor)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm 'mysql_example_settings'\n",
+    "!rm 'mysql_example_training.json'"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -173,7 +183,7 @@
     "                  ]\n",
     "\n",
     "        # Create a new deduper object and pass our data model to it.\n",
-    "        deduper = mydedupe.Dedupe(fields, num_cores=4)"
+    "        deduper = dedupe.Dedupe(fields, num_cores=4)"
    ]
   },
   {
@@ -187,8 +197,9 @@
     "#         cur.execute(DONOR_SELECT)\n",
     "#         temp_d = {i: row for i, row in enumerate(cur)}\n",
     "\n",
-    "        #Armin: Very Suspicious, does the ssdictcursor convers everything to string?\n",
-    "        df = cur.execute(DONOR_SELECT).as_pandas()#.astype(str)\n",
+    "        # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! \n",
+    "        # But for that, astype doesn't work, and a loot on temp_d is slow, so for now let's just use str\n",
+    "        df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n",
     "        temp_d = df.where(pd.notnull(df), None).to_dict('index')"
    ]
   },
diff --git a/notebooks/mysql_init_db.ipynb b/notebooks/athena_init_db.ipynb
similarity index 100%
rename from notebooks/mysql_init_db.ipynb
rename to notebooks/athena_init_db.ipynb

From 1d7d92ba9d49b027293265fc2069ac124b4f0fb0 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-28-85.eu-west-1.compute.internal>
Date: Tue, 30 Jun 2020 17:49:54 +0000
Subject: [PATCH 05/19] init ready

---
 {presto_example => athena_example}/README.md  |   0
 .../mysql.cnf_LOCAL                           |   0
 .../mysql_example.py                          |   0
 .../mysql_init_db.py                          |   0
 .../requirements.txt                          |   0
 notebooks/athena_example.ipynb                | 522 ++++++++++++------
 notebooks/athena_init_db.ipynb                | 334 +++--------
 7 files changed, 427 insertions(+), 429 deletions(-)
 rename {presto_example => athena_example}/README.md (100%)
 rename {presto_example => athena_example}/mysql.cnf_LOCAL (100%)
 rename {presto_example => athena_example}/mysql_example.py (100%)
 rename {presto_example => athena_example}/mysql_init_db.py (100%)
 rename {presto_example => athena_example}/requirements.txt (100%)

diff --git a/presto_example/README.md b/athena_example/README.md
similarity index 100%
rename from presto_example/README.md
rename to athena_example/README.md
diff --git a/presto_example/mysql.cnf_LOCAL b/athena_example/mysql.cnf_LOCAL
similarity index 100%
rename from presto_example/mysql.cnf_LOCAL
rename to athena_example/mysql.cnf_LOCAL
diff --git a/presto_example/mysql_example.py b/athena_example/mysql_example.py
similarity index 100%
rename from presto_example/mysql_example.py
rename to athena_example/mysql_example.py
diff --git a/presto_example/mysql_init_db.py b/athena_example/mysql_init_db.py
similarity index 100%
rename from presto_example/mysql_init_db.py
rename to athena_example/mysql_init_db.py
diff --git a/presto_example/requirements.txt b/athena_example/requirements.txt
similarity index 100%
rename from presto_example/requirements.txt
rename to athena_example/requirements.txt
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
index 3a4ca13a..34eb1d68 100644
--- a/notebooks/athena_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -5,20 +5,32 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "!pip install dedupe  pyathena"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n",
     "bucket='com.ria.scratch'\n",
     "region='eu-west-1'\n",
     "workgroup = 'RIA'\n",
     "root_key='as-dedupe/'\n",
+    "schema_name='ria_data_science_s3'\n",
     "import sys\n",
     "sys.path.insert(0, '../../dedupe/')\n",
-    "import dedupe"
+    "import dedupe\n",
+    "from io import StringIO\n",
+    "import csv"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -60,6 +72,11 @@
     "from pyathena import connect\n",
     "from pyathena.pandas_cursor import PandasCursor\n",
     "\n",
+    "def dict_cursor_execute(cur, query):\n",
+    "    df = cur.execute(query).as_pandas()\n",
+    "    return df.where(pd.notnull(df), None).astype(str)\n",
+    "\n",
+    "\n",
     "def record_pairs(result_set):\n",
     "    for i, row in enumerate(result_set):\n",
     "        a_record_id, a_record, b_record_id, b_record = row\n",
@@ -132,27 +149,35 @@
     "#                                 charset='utf8',\n",
     "#                                 read_default_file=MYSQL_CNF)\n",
     "\n",
-    "s3 = boto3.client('s3')  \n",
-    "conn = connect(s3_staging_dir=athena_garbage,\n",
-    "                 region_name=region, work_group=workgroup)\n",
-    "cur = conn.cursor(PandasCursor)"
+    "    s3 = boto3.client('s3')  \n",
+    "    conn = connect(s3_staging_dir=athena_garbage,\n",
+    "                     region_name=region, work_group=workgroup)\n",
+    "    cur = conn.cursor(PandasCursor, schema_name=schema_name)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!rm 'mysql_example_settings'\n",
-    "!rm 'mysql_example_training.json'"
+    "# !rm 'mysql_example_settings'\n",
+    "# !rm 'mysql_example_training.json'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "reading from  mysql_example_settings\n"
+     ]
+    }
+   ],
    "source": [
     "    # We'll be using variations on this following select statement to pull\n",
     "    # in campaign donor info.\n",
@@ -160,7 +185,7 @@
     "    # We did a fair amount of preprocessing of the fields in\n",
     "    # `mysql_init_db.py`    \n",
     "    DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n",
-    "                   \"from ria_data_science_s3.processed_donors limit 1000\"\n",
+    "                   \"from processed_donors\"\n",
     "\n",
     "    # ## Training\n",
     "\n",
@@ -183,15 +208,8 @@
     "                  ]\n",
     "\n",
     "        # Create a new deduper object and pass our data model to it.\n",
-    "        deduper = dedupe.Dedupe(fields, num_cores=4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "        deduper = dedupe.Dedupe(fields, num_cores=4)\n",
+    "\n",
     "        # We will sample pairs from the entire donor table for training\n",
     "#         with read_con.cursor() as cur:\n",
     "#         cur.execute(DONOR_SELECT)\n",
@@ -199,16 +217,13 @@
     "\n",
     "        # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! \n",
     "        # But for that, astype doesn't work, and a loot on temp_d is slow, so for now let's just use str\n",
-    "        df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n",
-    "        temp_d = df.where(pd.notnull(df), None).to_dict('index')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "        with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n",
+    "        #     Something like this is much faster, but let's  keep the changes minimal for now\n",
+    "        #     df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n",
+    "        #     temp_d = df.where(pd.notnull(df), None).to_dict('index')\n",
+    "            cursor_df = dict_cursor_execute(cursor, DONOR_SELECT)\n",
+    "            temp_d = cursor_df.to_dict('index')\n",
+    "\n",
     "        # If we have training data saved from a previous run of dedupe,\n",
     "        # look for it an load it in.\n",
     "        #\n",
@@ -235,15 +250,8 @@
     "        dedupe.convenience.console_label(deduper)\n",
     "        # When finished, save our labeled, training pairs to disk\n",
     "        with open(training_file, 'w') as tf:\n",
-    "            deduper.write_training(tf)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "            deduper.write_training(tf)\n",
+    "\n",
     "        # Notice our the argument here\n",
     "        #\n",
     "        # `recall` is the proportion of true dupes pairs that the learned\n",
@@ -261,9 +269,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "blocking...\n",
+      "creating blocking_map database\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<pyathena.pandas_cursor.PandasCursor at 0x7fd9ce27a400>"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "    # ## Blocking\n",
     "\n",
@@ -272,108 +299,243 @@
     "    # To run blocking on such a large set of data, we create a separate table\n",
     "    # that contains blocking keys and record ids\n",
     "    print('creating blocking_map database')\n",
-    "    with write_con.cursor() as cur:\n",
-    "        cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n",
-    "        cur.execute(\"CREATE TABLE blocking_map \"\n",
-    "                    \"(block_key VARCHAR(200), donor_id INTEGER) \"\n",
-    "                    \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
-    "\n",
-    "    write_con.commit()\n",
-    "\n",
+    "#     with write_con.cursor() as cur:\n",
+    "#         cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n",
+    "#         cur.execute(\"CREATE TABLE blocking_map \"\n",
+    "#                     \"(block_key VARCHAR(200), donor_id INTEGER) \"\n",
+    "#                     \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "\n",
+    "#     write_con.commit()\n",
+    "    cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n",
+    "\n",
+    "    q='''\n",
+    "    CREATE EXTERNAL TABLE blocking_map     \n",
+    "        (block_key VARCHAR(200), donor_id INTEGER)\n",
+    "    ROW FORMAT DELIMITED\n",
+    "      FIELDS TERMINATED BY '\\t'\n",
+    "      LINES TERMINATED BY '\\n'  \n",
+    "    LOCATION\n",
+    "        's3://{}/{}' \n",
+    "    TBLPROPERTIES (\n",
+    "        'classification'='csv', \n",
+    "        --'skip.header.line.count'='1',  \n",
+    "        'serialization.null.format'='')\n",
+    "    '''.format(bucket, root_key+'blocking_map') \n",
+    "    cur.execute(q)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "creating inverted index\n"
+     ]
+    }
+   ],
+   "source": [
     "    # If dedupe learned a Index Predicate, we have to take a pass\n",
     "    # through the data and create indices.\n",
     "    print('creating inverted index')\n",
     "\n",
     "    for field in deduper.fingerprinter.index_fields:\n",
-    "        with read_con.cursor() as cur:\n",
-    "            cur.execute(\"SELECT DISTINCT {field} FROM processed_donors \"\n",
-    "                        \"WHERE {field} IS NOT NULL\".format(field=field))\n",
-    "            field_data = (row[0] for row in cur)\n",
-    "            deduper.fingerprinter.index(field_data, field)\n",
-    "\n",
+    "        q = '''\n",
+    "        SELECT DISTINCT {field} FROM processed_donors \n",
+    "        WHERE {field} IS NOT NULL\n",
+    "        '''.format(field=field)\n",
+    "        cur_df = dict_cursor_execute(cur, q)\n",
+    "        # Do I need to cast it as a list?\n",
+    "        field_data = cur_df[field]\n",
+    "        deduper.fingerprinter.index(field_data, field)\n",
+    "     "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "writing blocking map\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'ResponseMetadata': {'RequestId': '5F215F152B811909',\n",
+       "  'HostId': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n",
+       "  'HTTPStatusCode': 200,\n",
+       "  'HTTPHeaders': {'x-amz-id-2': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n",
+       "   'x-amz-request-id': '5F215F152B811909',\n",
+       "   'date': 'Tue, 30 Jun 2020 15:27:16 GMT',\n",
+       "   'x-amz-server-side-encryption': 'AES256',\n",
+       "   'etag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n",
+       "   'content-length': '0',\n",
+       "   'server': 'AmazonS3'},\n",
+       "  'RetryAttempts': 0},\n",
+       " 'ETag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n",
+       " 'ServerSideEncryption': 'AES256'}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
     "    # Now we are ready to write our blocking map table by creating a\n",
     "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
     "    print('writing blocking map')\n",
+    "    \n",
     "\n",
-    "    with read_con.cursor() as read_cur:\n",
-    "        read_cur.execute(DONOR_SELECT)\n",
-    "        full_data = ((row['donor_id'], row) for row in read_cur)\n",
-    "        b_data = deduper.fingerprinter(full_data)\n",
-    "\n",
-    "        with write_con.cursor() as write_cur:\n",
-    "\n",
-    "            write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n",
-    "                                  b_data)\n",
-    "\n",
-    "    write_con.commit()\n",
+    "    read_cur_dict = dict_cursor_execute(cur, DONOR_SELECT).to_dict('records')\n",
+    "    full_data = ((row['donor_id'], row) for row in read_cur_dict)\n",
+    "    b_data = deduper.fingerprinter(full_data)\n",
+    "    buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n",
+    "#         csv_out.writerows(b_data)        \n",
     "\n",
-    "    # Free up memory by removing indices we don't need anymore\n",
-    "    deduper.fingerprinter.reset_indices()\n",
+    "#         \"\\n\".join(b_data)\n",
+    "#         with write_con.cursor() as write_cur:\n",
     "\n",
+    "#             write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n",
+    "#                                   b_data)\n",
+    "    s3.put_object(Bucket=bucket, Key=root_key+'blocking_map/blocking.csv', Body=buffer)    \n",
+    "#     write_con.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "    # indexing blocking_map\n",
-    "    print('creating index')\n",
-    "    with write_con.cursor() as cur:\n",
-    "        cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n",
+    "#     print('creating index')\n",
+    "#     with write_con.cursor() as cur:\n",
+    "#         cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n",
     "\n",
-    "    write_con.commit()\n",
-    "    read_con.commit()\n",
+    "#     write_con.commit()\n",
+    "#     read_con.commit()\n",
     "\n",
     "    # select unique pairs to compare\n",
-    "    with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur:\n",
-    "\n",
-    "        read_cur.execute(\"\"\"\n",
-    "               select a.donor_id,\n",
-    "                      json_object('city', a.city,\n",
-    "                                  'name', a.name,\n",
-    "                                  'zip', a.zip,\n",
-    "                                  'state', a.state,\n",
-    "                                  'address', a.address),\n",
-    "                      b.donor_id,\n",
-    "                      json_object('city', b.city,\n",
-    "                                  'name', b.name,\n",
-    "                                  'zip', b.zip,\n",
-    "                                  'state', b.state,\n",
-    "                                  'address', b.address)\n",
-    "               from (select DISTINCT l.donor_id as east, r.donor_id as west\n",
-    "                     from blocking_map as l\n",
-    "                     INNER JOIN blocking_map as r\n",
-    "                     using (block_key)\n",
-    "                     where l.donor_id < r.donor_id) ids\n",
-    "               INNER JOIN processed_donors a on ids.east=a.donor_id\n",
-    "               INNER JOIN processed_donors b on ids.west=b.donor_id\n",
-    "               \"\"\")\n",
-    "\n",
-    "        # ## Clustering\n",
-    "\n",
-    "        print('clustering...')\n",
-    "        clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n",
-    "                                          threshold=0.5)\n",
-    "\n",
-    "        with write_con.cursor() as write_cur:\n",
-    "\n",
-    "            # ## Writing out results\n",
-    "\n",
-    "            # We now have a sequence of tuples of donor ids that dedupe believes\n",
-    "            # all refer to the same entity. We write this out onto an entity map\n",
-    "            # table\n",
-    "            write_cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n",
-    "\n",
-    "            print('creating entity_map database')\n",
-    "            write_cur.execute(\"CREATE TABLE entity_map \"\n",
-    "                              \"(donor_id INTEGER, canon_id INTEGER, \"\n",
-    "                              \" cluster_score FLOAT, PRIMARY KEY(donor_id))\")\n",
-    "\n",
-    "            write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)',\n",
-    "                                  cluster_ids(clustered_dupes))\n",
-    "\n",
-    "    write_con.commit()\n",
-    "\n",
-    "    with write_con.cursor() as cur:\n",
-    "        cur.execute(\"CREATE INDEX head_index ON entity_map (canon_id)\")\n",
-    "\n",
-    "    write_con.commit()\n",
-    "    read_con.commit()\n",
+    "    q='''\n",
+    "    SELECT a.donor_id,\n",
+    "        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n",
+    "                              ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n",
+    "                    AS JSON)),\n",
+    "        b.donor_id,\n",
+    "        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n",
+    "                  ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n",
+    "              AS JSON))\n",
+    "    FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
+    "         from blocking_map as l\n",
+    "         INNER JOIN blocking_map as r\n",
+    "         using (block_key)\n",
+    "         where l.donor_id < r.donor_id) ids\n",
+    "    INNER JOIN processed_donors a on ids.east=a.donor_id\n",
+    "    INNER JOIN processed_donors b on ids.west=b.donor_id\n",
+    "    '''\n",
+    "    read_cur_dict=dict_cursor_execute(cur, q).itertuples(index=False, name=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "StopIteration",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mStopIteration\u001b[0m                             Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-13-bb1ab6348ed6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread_cur_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mStopIteration\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "next(read_cur_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "clustering...\n"
+     ]
+    },
+    {
+     "ename": "BlockingError",
+     "evalue": "No records have been blocked together. Is the data you are trying to match like the data you trained on?",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mBlockingError\u001b[0m                             Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-11-186191d0dae6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'clustering...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n\u001b[0m\u001b[1;32m      5\u001b[0m                                       threshold=0.5)\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mscore\u001b[0;34m(self, pairs)\u001b[0m\n\u001b[1;32m    104\u001b[0m                                            \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_model\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    105\u001b[0m                                            \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m                                            self.num_cores)\n\u001b[0m\u001b[1;32m    107\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    108\u001b[0m             raise RuntimeError('''\n",
+      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/core.py\u001b[0m in \u001b[0;36mscoreDuplicates\u001b[0;34m(record_pairs, data_model, classifier, num_cores)\u001b[0m\n\u001b[1;32m    218\u001b[0m     \u001b[0mfirst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecord_pairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpeek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord_pairs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    219\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mfirst\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m         raise BlockingError(\"No records have been blocked together. \"\n\u001b[0m\u001b[1;32m    221\u001b[0m                             \u001b[0;34m\"Is the data you are trying to match like \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    222\u001b[0m                             \"the data you trained on?\")\n",
+      "\u001b[0;31mBlockingError\u001b[0m: No records have been blocked together. Is the data you are trying to match like the data you trained on?"
+     ]
+    }
+   ],
+   "source": [
+    "    # ## Clustering\n",
     "\n",
+    "    print('clustering...')\n",
+    "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n",
+    "                                          threshold=0.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n",
+    "\n",
+    "    print('creating entity_map database')\n",
+    "    q='''\n",
+    "    CREATE EXTERNAL TABLE entity_map     \n",
+    "        (donor_id INTEGER, canon_id INTEGER, \n",
+    "         cluster_score FLOAT)\n",
+    "    ROW FORMAT DELIMITED\n",
+    "      FIELDS TERMINATED BY '\\t'\n",
+    "      LINES TERMINATED BY '\\n'  \n",
+    "    LOCATION\n",
+    "        's3://{}/{}' \n",
+    "    TBLPROPERTIES (\n",
+    "        'classification'='csv', \n",
+    "        --'skip.header.line.count'='1',  \n",
+    "        'serialization.null.format'='')\n",
+    "    '''.format(bucket, root_key+'entity_map') \n",
+    "    cur.execute(q) \n",
+    "\n",
+    "    buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n",
+    "    s3.put_object(Bucket=bucket, Key=root_key+'entity_map/entity_map.csv', Body=buffer)    \n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "    # Print out the number of duplicates found\n",
     "    print('# duplicate sets')\n",
     "\n",
@@ -385,52 +547,36 @@
     "    # For example, let's see who the top 10 donors are.\n",
     "\n",
     "    locale.setlocale(locale.LC_ALL, '')  # for pretty printing numbers\n",
-    "\n",
-    "    with read_con.cursor() as cur:\n",
-    "        # Create a temporary table so each group and unmatched record has\n",
-    "        # a unique id\n",
-    "        cur.execute(\"CREATE TEMPORARY TABLE e_map \"\n",
-    "                    \"SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id \"\n",
-    "                    \"FROM entity_map \"\n",
-    "                    \"RIGHT JOIN donors USING(donor_id)\")\n",
-    "\n",
-    "        cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, \"\n",
-    "                    \"donation_totals.totals AS totals \"\n",
-    "                    \"FROM donors INNER JOIN \"\n",
-    "                    \"(SELECT canon_id, SUM(amount) AS totals \"\n",
-    "                    \" FROM contributions INNER JOIN e_map \"\n",
-    "                    \" USING (donor_id) \"\n",
-    "                    \" GROUP BY (canon_id) \"\n",
-    "                    \" ORDER BY totals \"\n",
-    "                    \" DESC LIMIT 10) \"\n",
-    "                    \"AS donation_totals \"\n",
-    "                    \"WHERE donors.donor_id = donation_totals.canon_id\")\n",
-    "\n",
-    "        print(\"Top Donors (deduped)\")\n",
-    "        for row in cur:\n",
-    "            row['totals'] = locale.currency(row['totals'], grouping=True)\n",
-    "            print('%(totals)20s: %(name)s' % row)\n",
-    "\n",
-    "        # Compare this to what we would have gotten if we hadn't done any\n",
-    "        # deduplication\n",
-    "        cur.execute(\"SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, \"\n",
-    "                    \"SUM(contributions.amount) AS totals \"\n",
-    "                    \"FROM donors INNER JOIN contributions \"\n",
-    "                    \"USING (donor_id) \"\n",
-    "                    \"GROUP BY (donor_id) \"\n",
-    "                    \"ORDER BY totals DESC \"\n",
-    "                    \"LIMIT 10\")\n",
-    "\n",
-    "        print(\"Top Donors (raw)\")\n",
-    "        for row in cur:\n",
-    "            row['totals'] = locale.currency(row['totals'], grouping=True)\n",
-    "            print('%(totals)20s: %(name)s' % row)\n",
-    "\n",
-    "        # Close our database connection\n",
-    "    read_con.close()\n",
-    "    write_con.close()\n",
-    "\n",
-    "    print('ran in', time.time() - start_time, 'seconds')"
+    "    \n",
+    "    cur.execute(\"DROP TABLE IF EXISTS e_map\")\n",
+    "\n",
+    "    q = '''\n",
+    "    CREATE TABLE e_map as \n",
+    "        SELECT COALESCE(canon_id, donors.donor_id) AS canon_id, donors.donor_id \n",
+    "        FROM entity_map \n",
+    "            RIGHT JOIN donors USING(donor_id)\n",
+    "    '''\n",
+    "    \n",
+    "    cur.execute(q)\n",
+    "    q ='''\n",
+    "    SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
+    "        donation_totals.totals AS totals \n",
+    "    FROM donors INNER JOIN \n",
+    "        (SELECT canon_id, SUM(cast (amount as double)) AS totals \n",
+    "        FROM contributions INNER JOIN e_map \n",
+    "        USING (donor_id) \n",
+    "        GROUP BY (canon_id) \n",
+    "        ORDER BY totals \n",
+    "        DESC LIMIT 10) \n",
+    "        AS donation_totals \n",
+    "    ON donors.donor_id = donation_totals.canon_id\n",
+    "    '''\n",
+    "    cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n",
+    "\n",
+    "    print(\"Top Donors (deduped)\")\n",
+    "    for row in cur_dict:\n",
+    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
+    "        print('%(totals)20s: %(name)s' % row)"
    ]
   },
   {
@@ -438,7 +584,33 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "    # Compare this to what we would have gotten if we hadn't done any\n",
+    "    # deduplication\n",
+    "\n",
+    "    q = '''\n",
+    "    SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
+    "        SUM(cast(contributions.amount as double)) AS totals \n",
+    "    FROM donors INNER JOIN contributions \n",
+    "        USING (donor_id) \n",
+    "    GROUP BY donor_id, donors.first_name, donors.last_name\n",
+    "    ORDER BY totals DESC \n",
+    "    LIMIT 10\n",
+    "    '''\n",
+    "\n",
+    "    cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n",
+    "\n",
+    "    print(\"Top Donors (raw)\")\n",
+    "    for row in cur_dict:\n",
+    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
+    "        print('%(totals)20s: %(name)s' % row)\n",
+    "\n",
+    "    # Close our database connection\n",
+    "#     read_con.close()\n",
+    "#     write_con.close()\n",
+    "\n",
+    "    print('ran in', time.time() - start_time, 'seconds')"
+   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
index bb6331a1..b86d1b9f 100644
--- a/notebooks/athena_init_db.ipynb
+++ b/notebooks/athena_init_db.ipynb
@@ -2,31 +2,54 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ../athena_example/config.py\n"
+     ]
+    }
+   ],
    "source": [
-    "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n",
-    "bucket='com.ria.scratch'\n",
-    "region='eu-west-1'\n",
-    "workgroup = 'RIA'\n",
-    "root_key='as-dedupe/'"
+    "%%writefile ../athena_example/config.py\n",
+    "# Connection parameters\n",
+    "ACCESS_KEY_ID = None\n",
+    "SECRET_ACCESS_KEY = None\n",
+    "ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n",
+    "WORKGROUP = 'RIA'\n",
+    "REGION = 'eu-west-1'\n",
+    "\n",
+    "# Database Parameters\n",
+    "DATABASE_BUCKET = 'com.ria.scratch'\n",
+    "DATABASE_ROOT_KEY = 'as-dedupe/'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ../athena_example/athena_example.py\n"
+     ]
+    }
+   ],
    "source": [
+    "%%writefile ../athena_example/athena_example.py\n",
     "#!/usr/bin/python\n",
     "\"\"\"\n",
-    "This is a setup script for mysql_example.  It downloads a zip file of\n",
-    "Illinois campaign contributions and loads them into a MySQL database\n",
+    "This is a setup script for athena_example.  It downloads a zip file of\n",
+    "Illinois campaign contributions and loads them into an Athena database\n",
     "named 'contributions'.\n",
     " \n",
     "__Note:__ You will need to run this script first before execuing\n",
-    "[mysql_example.py](mysql_example.html).\n",
+    "[athena_example.py](athena_example.py).\n",
     " \n",
     "Tables created:\n",
     "* raw_table - raw import of entire CSV file\n",
@@ -43,21 +66,12 @@
     "from urllib.request import urlopen\n",
     "import boto3\n",
     "from pyathena import connect\n",
+    "import config\n",
     "\n",
-    "# import MySQLdb\n",
-    "\n",
-    "# warnings.filterwarnings('ignore', category=MySQLdb.Warning)\n",
     "\n",
     "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
-    "contributions_txt_file = 'Illinois-campaign-contributions.txt'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n",
+    "\n",
     "if not os.path.exists(contributions_zip_file) :\n",
     "    print('downloading', contributions_zip_file, '(~60mb) ...')\n",
     "    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n",
@@ -72,65 +86,27 @@
     "    for f in zip_file_contents:\n",
     "        if ('.txt' in f):\n",
     "            zip_file.extract(f)\n",
-    "    zip_file.close()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf', \n",
-    "#                        local_infile = 1,\n",
-    "#                        sql_mode=\"ALLOW_INVALID_DATES\",\n",
-    "#                        db='contributions')\n",
-    "# c = conn.cursor()\n",
+    "    zip_file.close()\n",
+    "\n",
     "\n",
     "s3 = boto3.client('s3')  \n",
-    "conn = connect(s3_staging_dir=athena_garbage,\n",
-    "                 region_name=region, work_group=workgroup)\n",
-    "c = conn.cursor()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "\n",
+    "\n",
+    "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n",
+    "               aws_secret_access_key=config.SECRET_ACCESS_KEY,\n",
+    "               s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n",
+    "               region_name=config.REGION, \n",
+    "               work_group=config.WORKGROUP)\n",
+    "c = conn.cursor()\n",
+    "\n",
     "print('importing raw data from csv...')\n",
     "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n",
     "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.donors\")\n",
     "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.recipients\")\n",
     "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.contributions\")\n",
-    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# c.execute(\"CREATE TABLE raw_table \"\n",
-    "#           \"(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \"\n",
-    "#           \" address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \"\n",
-    "#           \" state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \"\n",
-    "#           \" date_recieved VARCHAR(10), loan_amount VARCHAR(12), \"\n",
-    "#           \" amount VARCHAR(23), receipt_type VARCHAR(23), \"\n",
-    "#           \" employer VARCHAR(70), occupation VARCHAR(40), \"\n",
-    "#           \" vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \"\n",
-    "#           \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n",
-    "#           \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n",
-    "#           \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n",
-    "#           \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n",
-    "#           \" report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \"\n",
-    "#           \" committee_name VARCHAR(70), committee_id VARCHAR(37)) \"\n",
-    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")\n",
     "\n",
     "\n",
-    "# conn.commit()\n",
     "q=r'''\n",
     "CREATE EXTERNAL TABLE ria_data_science_s3.raw_table \n",
     "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
@@ -155,85 +131,35 @@
     "    'classification'='csv', \n",
     "    'skip.header.line.count'='1',  \n",
     "    'serialization.null.format'='')\n",
-    "'''.format(bucket, root_key+'raw_table') \n",
-    "c.execute(q)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# c.execute(\"LOAD DATA LOCAL INFILE %s INTO TABLE raw_table \"\n",
-    "#           \"FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\r\\n' \" \n",
-    "#           \"IGNORE 1 LINES \"\n",
-    "#           \"(reciept_id, last_name, first_name, \"\n",
-    "#           \" address_1, address_2, city, state, \"\n",
-    "#           \" zip, report_type, date_recieved, \"\n",
-    "#           \" loan_amount, amount, receipt_type, \"\n",
-    "#           \" employer, occupation, vendor_last_name, \"\n",
-    "#           \" vendor_first_name, vendor_address_1, \"\n",
-    "#           \" vendor_address_2, vendor_city, vendor_state, \"\n",
-    "#           \" vendor_zip, description, election_type, \"\n",
-    "#           \" election_year, \"\n",
-    "#           \" report_period_begin, report_period_end, \"\n",
-    "#           \" committee_name, committee_id, @dummy)\",\n",
-    "#           (contributions_txt_file,))\n",
+    "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') \n",
+    "c.execute(q)\n",
+    "\n",
+    "\n",
     "\n",
     "df = pd.read_csv(contributions_txt_file, sep='\\t', error_bad_lines=False, dtype=str, index_col=0)\n",
     "# Remove the very few records that mess up the demo \n",
     "# (demo purposes only! Don't do something like this in production)\n",
-    "# c.execute(\"DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10\")\n",
     "df = df[df['RcvDate'].str.len()>=10]\n",
     "\n",
     "# set empty, non-zero, strings in date columns to null\n",
-    "# c.execute(\"UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10\")\n",
     "df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n",
     "\n",
-    "# c.execute(\"UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10\")\n",
     "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
     "\n",
     "#committee ID is requred. Remove the 2 rows that don't have it.\n",
-    "# c.execute(\"DELETE FROM raw_table WHERE committee_id=''\");\n",
     "df = df[df['ID'] != '']\n",
     "\n",
     "# There's a record with a date stuck in the committee_id column, which causes\n",
     "# problems when inserting into the contributions table below. Get rid of it this \n",
     "# way.\n",
-    "# c.execute(\"DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9\")\n",
     "df = df[df['ID'].str.len() <=9]\n",
     "\n",
     "# Nullifying empty strings\n",
     "df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
     "\n",
-    "s3.put_object(Bucket=bucket, Key=root_key+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))\n",
+    "\n",
     "print('creating donors table...')\n",
-    "# c.execute(\"CREATE TABLE donors \"\n",
-    "#           \"(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, \"\n",
-    "#           \" last_name VARCHAR(70), first_name VARCHAR(35), \"\n",
-    "#           \" address_1 VARCHAR(35), address_2 VARCHAR(36), \"\n",
-    "#           \" city VARCHAR(20), state VARCHAR(15), \"\n",
-    "#           \" zip VARCHAR(11), employer VARCHAR(70), \"\n",
-    "#           \" occupation VARCHAR(40)) \"\n",
-    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
-    "# c.execute(\"INSERT INTO donors \"\n",
-    "#           \"(first_name, last_name, address_1,\"\n",
-    "#           \" address_2, city, state, zip, employer, occupation) \"\n",
-    "#           \"SELECT DISTINCT \"\n",
-    "#           \"TRIM(first_name), TRIM(last_name), TRIM(address_1),  \"\n",
-    "#           \"TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), \"\n",
-    "#           \"TRIM(employer), TRIM(occupation) \"\n",
-    "#           \"FROM raw_table\")\n",
-    "# conn.commit()\n",
     "q='''\n",
     "CREATE TABLE ria_data_science_s3.donors as\n",
     "    with tmp as\n",
@@ -246,79 +172,15 @@
     "      FROM ria_data_science_s3.raw_table)\n",
     "    SELECT row_number() over () as donor_id, * from tmp'''\n",
     "c.execute(q)\n",
-    "# print('creating indexes on donors table')\n",
-    "# c.execute(\"CREATE INDEX donors_donor_info ON donors \"\n",
-    "#           \"(last_name, first_name, address_1, address_2, city, \"\n",
-    "#           \" state, zip)\")\n",
-    "# conn.commit()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# print('creating recipients table...')\n",
-    "# c.execute(\"CREATE TABLE recipients \"\n",
-    "#           \"(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) \"\n",
-    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
     "\n",
-    "# c.execute(\"INSERT IGNORE INTO recipients \"\n",
-    "#           \"SELECT DISTINCT committee_id, committee_name FROM raw_table\")\n",
-    "# conn.commit()\n",
     "\n",
     "q='''\n",
     "CREATE TABLE ria_data_science_s3.recipients as\n",
     "    SELECT DISTINCT committee_id, committee_name FROM ria_data_science_s3.raw_table\n",
     "'''\n",
-    "c.execute(q)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print('creating contributions table')\n",
-    "# c.execute(\"CREATE TABLE contributions \"\n",
-    "#           \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n",
-    "#           \" report_type VARCHAR(24), date_recieved DATE, \"\n",
-    "#           \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n",
-    "#           \" receipt_type VARCHAR(23), \"\n",
-    "#           \" vendor_last_name VARCHAR(70), \"\n",
-    "#           \" vendor_first_name VARCHAR(20), \"\n",
-    "#           \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n",
-    "#           \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n",
-    "#           \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n",
-    "#           \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n",
-    "#           \" report_period_begin DATE, report_period_end DATE) \"\n",
-    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
-    "\n",
-    "\n",
-    "# c.execute(\"INSERT INTO contributions \"\n",
-    "#           \"SELECT reciept_id, donors.donor_id, committee_id, \"\n",
-    "#           \" report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), \"\n",
-    "#           \" loan_amount, amount, \"\n",
-    "#           \" receipt_type, vendor_last_name , \"\n",
-    "#           \" vendor_first_name, vendor_address_1, vendor_address_2, \"\n",
-    "#           \" vendor_city, vendor_state, vendor_zip, description, \"\n",
-    "#           \" election_type, election_year, \"\n",
-    "#           \" STR_TO_DATE(report_period_begin, '%m/%d/%Y'), \"\n",
-    "#           \" STR_TO_DATE(report_period_end, '%m/%d/%Y') \"\n",
-    "#           \"FROM raw_table JOIN donors ON \"\n",
-    "#           \"donors.first_name = TRIM(raw_table.first_name) AND \"\n",
-    "#           \"donors.last_name = TRIM(raw_table.last_name) AND \"\n",
-    "#           \"donors.address_1 = TRIM(raw_table.address_1) AND \"\n",
-    "#           \"donors.address_2 = TRIM(raw_table.address_2) AND \"\n",
-    "#           \"donors.city = TRIM(raw_table.city) AND \"\n",
-    "#           \"donors.state = TRIM(raw_table.state) AND \"\n",
-    "#           \"donors.employer = TRIM(raw_table.employer) AND \"\n",
-    "#           \"donors.occupation = TRIM(raw_table.occupation) AND \"\n",
-    "#           \"donors.zip = TRIM(raw_table.zip)\")\n",
-    "# conn.commit()\n",
+    "c.execute(q)\n",
     "\n",
+    "print('creating contributions table')\n",
     "q='''\n",
     "CREATE TABLE ria_data_science_s3.contributions as\n",
     "    SELECT reciept_id, donors.donor_id, committee_id, \n",
@@ -342,55 +204,6 @@
     "        donors.zip = TRIM(raw_table.zip)'''\n",
     "c.execute(q)\n",
     "\n",
-    "\n",
-    "# print('creating indexes on contributions')\n",
-    "# c.execute(\"ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)\")\n",
-    "# c.execute(\"CREATE INDEX donor_idx ON contributions (donor_id)\")\n",
-    "# c.execute(\"CREATE INDEX recipient_idx ON contributions (recipient_id)\")\n",
-    "\n",
-    "\n",
-    "# conn.commit()\n",
-    "\n",
-    "# print('nullifying empty strings in donors')\n",
-    "# c.execute(\"UPDATE donors \"\n",
-    "#           \"SET \"\n",
-    "#           \"first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, \"\n",
-    "#           \"last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, \"\n",
-    "#           \"address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, \"\n",
-    "#           \"address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, \"\n",
-    "#           \"city = CASE city WHEN '' THEN NULL ELSE city END, \"\n",
-    "#           \"state = CASE state WHEN '' THEN NULL ELSE state END, \"\n",
-    "#           \"employer = CASE employer WHEN '' THEN NULL ELSE employer END, \" \n",
-    "#           \"occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, \" \n",
-    "#           \"zip = CASE zip WHEN '' THEN NULL ELSE zip END\")\n",
-    "\n",
-    "\n",
-    "# conn.commit()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# c.execute(\"CREATE TABLE processed_donors AS \" \n",
-    "#           \"(SELECT donor_id, \" \n",
-    "#           \" LOWER(city) AS city, \" \n",
-    "#           \" CASE WHEN (first_name IS NULL AND last_name IS NULL) \"\n",
-    "#           \"      THEN NULL \"\n",
-    "#           \"      ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) \"\n",
-    "#           \" END AS name, \" \n",
-    "#           \" LOWER(zip) AS zip, \" \n",
-    "#           \" LOWER(state) AS state, \" \n",
-    "#           \" CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \"\n",
-    "#           \"      THEN NULL \"\n",
-    "#           \"      ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) \"\n",
-    "#           \" END AS address, \" \n",
-    "#           \" LOWER(occupation) AS occupation, \"\n",
-    "#           \" LOWER(employer) AS employer, \"\n",
-    "#           \" ISNULL(first_name) AS person \"\n",
-    "#           \" FROM donors)\")\n",
     "q = '''\n",
     "CREATE TABLE ria_data_science_s3.processed_donors AS  \n",
     "    SELECT donor_id,  \n",
@@ -412,19 +225,34 @@
     "c.execute(q)\n",
     "\n",
     "\n",
-    "# c.execute(\"CREATE INDEX donor_idx ON processed_donors (donor_id)\")\n",
     "\n",
-    "# c.close()\n",
-    "# conn.close()"
+    "\n",
+    "print('done')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "importing raw data from csv...\n",
+      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
+      "creating donors table...\n",
+      "creating contributions table\n",
+      "done\n"
+     ]
+    }
+   ],
    "source": [
-    "print('done')"
+    "!python ../athena_example/athena_example.py"
    ]
   },
   {
@@ -432,9 +260,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "df.shape"
-   ]
+   "source": []
   }
  ],
  "metadata": {

From 19ff16999bdf5e7455fd358168c9d4c9ea94d5b5 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-21-73.eu-west-1.compute.internal>
Date: Thu, 16 Jul 2020 15:57:25 +0000
Subject: [PATCH 06/19] new datasets

---
 notebooks/athena_example.ipynb | 374 ++++++++++++++++++++-------------
 notebooks/athena_init_db.ipynb |  76 ++++---
 2 files changed, 275 insertions(+), 175 deletions(-)

diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
index 34eb1d68..01c42392 100644
--- a/notebooks/athena_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -2,37 +2,134 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting dedupe\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/5e/09/179feb316147279c76ea7e6dc5a5f9e00a6feadaeda131d535247e580619/dedupe-2.0.3-cp36-cp36m-manylinux1_x86_64.whl (89kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 92kB 239kB/s ta 0:00:011\n",
+      "\u001b[?25hCollecting pyathena\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/40/85/f37c049922f5d47e9126d7817ef7b8fb7abb2e6a9ea0dd06adcbffc0e8bc/PyAthena-1.10.8-py2.py3-none-any.whl (53kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 61kB 1.9MB/s ta 0:00:011\n",
+      "\u001b[?25hCollecting haversine>=0.4.1 (from dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/72/8e/6df8b563dd6b2961a36cd740b34c00b89142f1b97d92092c133379b2973f/haversine-2.2.0-py2.py3-none-any.whl\n",
+      "Collecting simplecosine>=1.2 (from dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/2d/22/6ea3a5ab8aea06d6563eb927e706f7342a00d1849c9be6143a2a7d84ddbd/simplecosine-1.2-py2.py3-none-any.whl\n",
+      "Collecting rlr>=2.4.3 (from dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/fa/02/3b1a9727a622ff4320919645ce35ceb887d90784d0bab41484756c33b7ea/rlr-2.4.5-py2.py3-none-any.whl\n",
+      "Collecting categorical-distance>=1.9 (from dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/1d/b7/4f97771f52c63916f4e4d349a644c2387961592e76070e7310463b2d70a5/categorical_distance-1.9-py3-none-any.whl\n",
+      "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.14.3)\n",
+      "Collecting fastcluster (from dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/1e/9d/3d7525a4722ee4a11ad969762d1de53b6dac326b5ac1366221e06958e1d7/fastcluster-1.1.26-cp36-cp36m-manylinux1_x86_64.whl (154kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 163kB 707kB/s ta 0:00:01\n",
+      "\u001b[?25hCollecting highered>=0.2.0 (from dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/81/00/cbd902cfd14ad1992fcdaa11a615d47b36b6136dc690e19b0afa58c7365d/highered-0.2.1-py2.py3-none-any.whl\n",
+      "Collecting dedupe-hcluster (from dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b2/1f/c6f6075c2e988b3a1759fabaf91d2f8f2de59c6e607a3fd9a2e06112a0de/dedupe_hcluster-0.3.8-cp36-cp36m-manylinux1_x86_64.whl (531kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 532kB 5.2MB/s ta 0:00:01\n",
+      "\u001b[?25hCollecting BTrees>=4.1.4 (from dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/48/b3/9ce3b32817db98e8bf20d6873e18ee3ee7feded135434d800b72bf8dfb9f/BTrees-4.7.2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n",
+      "\u001b[K    100% |████████████████████████████████| 3.0MB 8.2MB/s eta 0:00:01\n",
+      "\u001b[?25hCollecting Levenshtein-search (from dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/93/89/dc320196d10447540c95f58eab5dd316a2166310356c1d88b84724f4e793/Levenshtein_search-1.4.5-cp36-cp36m-manylinux1_x86_64.whl (59kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 61kB 21.2MB/s ta 0:00:01\n",
+      "\u001b[?25hCollecting zope.index (from dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ab/0f/f93bddfac1189bb6b973142da3ef2caa6817a59b07ca448095a30b644737/zope.index-5.0.0-cp36-cp36m-manylinux1_x86_64.whl (101kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 102kB 17.6MB/s a 0:00:01\n",
+      "\u001b[?25hCollecting typing-extensions (from dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/0c/0e/3f026d0645d699e7320b59952146d56ad7c374e9cd72cd16e7c74e657a0f/typing_extensions-3.7.4.2-py3-none-any.whl\n",
+      "Collecting affinegap>=1.3 (from dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b2/6a/91f5defe8178104449bc897208c9780b159575d16a959a5074f0bf39a6f0/affinegap-1.11-cp36-cp36m-manylinux1_x86_64.whl (45kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 51kB 12.0MB/s ta 0:00:01\n",
+      "\u001b[?25hCollecting doublemetaphone (from dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/c0/27/8df369334aac64755ca899b9a7cc4d2d60e800cca148322ef19309cdae0f/DoubleMetaphone-0.1-cp36-cp36m-manylinux1_x86_64.whl (78kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 81kB 3.4MB/s eta 0:00:01\n",
+      "\u001b[?25hCollecting dedupe-variable-datetime (from dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/65/8f/d21f6acadcdfd681ee038153883b5673b8b76f790e465d791780e6b7bf60/dedupe_variable_datetime-0.1.5-py3-none-any.whl\n",
+      "Collecting tenacity>=4.1.0 (from pyathena)\n",
+      "  Downloading https://files.pythonhosted.org/packages/b5/05/ff089032442058bd3386f9cd991cd88ccac81dca1494d78751621ee35e62/tenacity-6.2.0-py2.py3-none-any.whl\n",
+      "Requirement already satisfied: botocore>=1.5.52 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.15.39)\n",
+      "Collecting future (from pyathena)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 829kB 14.2MB/s ta 0:00:01\n",
+      "\u001b[?25hRequirement already satisfied: boto3>=1.4.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.12.39)\n",
+      "Collecting pylbfgs (from rlr>=2.4.3->dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b8/5b/b8e1ef62e5e5b034ce5ae919b64158ec8da4f64c995444aec7fd96e8ec42/PyLBFGS-0.2.0.13-cp36-cp36m-manylinux1_x86_64.whl (205kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 215kB 16.4MB/s ta 0:00:01\n",
+      "\u001b[?25hCollecting pyhacrf-datamade>=0.2.0 (from highered>=0.2.0->dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/84/f5/971e17a8b6686d5fc3d562e29e9c902743eb5f0f4436880b86cb11c0149c/pyhacrf_datamade-0.2.5-cp36-cp36m-manylinux1_x86_64.whl (788kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 798kB 14.5MB/s ta 0:00:01\n",
+      "\u001b[?25hCollecting zope.interface (from BTrees>=4.1.4->dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/fc/7e/8e1efcfa22b722a0d6e992172ab15a871988c290cb722fe8da6d11f1aeb2/zope.interface-5.1.0-cp36-cp36m-manylinux1_x86_64.whl (234kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 235kB 16.6MB/s ta 0:00:01\n",
+      "\u001b[?25hCollecting persistent>=4.1.0 (from BTrees>=4.1.4->dedupe)\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/2e/4e/9bde9a2f63273f2e63a94a8198781aac559cc6efd2f560d69afcb0d9d8b5/persistent-4.6.4-cp36-cp36m-manylinux1_x86_64.whl (246kB)\n",
+      "\u001b[K    100% |████████████████████████████████| 256kB 17.5MB/s ta 0:00:01\n",
+      "\u001b[?25hRequirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.11.0)\n",
+      "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (39.1.0)\n",
+      "Collecting datetime-distance (from dedupe-variable-datetime->dedupe)\n",
+      "  Downloading https://files.pythonhosted.org/packages/6b/98/a5eff9256ff27e3bb8030466dabd772002e5014b9237cbeb18c542050ff5/datetime_distance-0.1.3-py3-none-any.whl\n",
+      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (2.7.3)\n",
+      "Requirement already satisfied: docutils<0.16,>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.14)\n",
+      "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.9.4)\n",
+      "Requirement already satisfied: urllib3<1.26,>=1.20; python_version != \"3.4\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (1.23)\n",
+      "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from boto3>=1.4.4->pyathena) (0.3.3)\n",
+      "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.11.5)\n",
+      "Requirement already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == \"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.18)\n",
+      "Building wheels for collected packages: future\n",
+      "  Running setup.py bdist_wheel for future ... \u001b[?25ldone\n",
+      "\u001b[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully built future\n",
+      "Installing collected packages: haversine, simplecosine, future, pylbfgs, rlr, categorical-distance, fastcluster, pyhacrf-datamade, highered, dedupe-hcluster, zope.interface, persistent, BTrees, Levenshtein-search, zope.index, typing-extensions, affinegap, doublemetaphone, datetime-distance, dedupe-variable-datetime, dedupe, tenacity, pyathena\n",
+      "Successfully installed BTrees-4.7.2 Levenshtein-search-1.4.5 affinegap-1.11 categorical-distance-1.9 datetime-distance-0.1.3 dedupe-2.0.3 dedupe-hcluster-0.3.8 dedupe-variable-datetime-0.1.5 doublemetaphone-0.1 fastcluster-1.1.26 future-0.18.2 haversine-2.2.0 highered-0.2.1 persistent-4.6.4 pyathena-1.10.8 pyhacrf-datamade-0.2.5 pylbfgs-0.2.0.13 rlr-2.4.5 simplecosine-1.2 tenacity-6.2.0 typing-extensions-3.7.4.2 zope.index-5.0.0 zope.interface-5.1.0\n",
+      "\u001b[33mYou are using pip version 10.0.1, however version 20.2b1 is available.\n",
+      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
+     ]
+    }
+   ],
    "source": [
     "!pip install dedupe  pyathena"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "athena_garbage = 's3://com.ria.scratch/athena_garbage/'\n",
-    "bucket='com.ria.scratch'\n",
-    "region='eu-west-1'\n",
-    "workgroup = 'RIA'\n",
-    "root_key='as-dedupe/'\n",
-    "schema_name='ria_data_science_s3'\n",
     "import sys\n",
-    "sys.path.insert(0, '../../dedupe/')\n",
-    "import dedupe\n",
-    "from io import StringIO\n",
-    "import csv"
+    "sys.path.insert(0, '../athena_example/')\n",
+    "import config\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "module 'logging' has no attribute 'logging'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-3-878b2de91830>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     86\u001b[0m \u001b[0;31m## Armin\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 87\u001b[0;31m     \u001b[0mlog_level\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEBUG\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     88\u001b[0m \u001b[0;31m#######\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     89\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: module 'logging' has no attribute 'logging'"
+     ]
+    }
+   ],
    "source": [
     "# %load ../mysql_example/mysql_example.py\n",
     "#!/usr/bin/python\n",
@@ -61,16 +158,18 @@
     "import optparse\n",
     "import locale\n",
     "import json\n",
+    "from io import StringIO\n",
+    "import csv\n",
     "import pandas as pd\n",
     "\n",
     "# import MySQLdb\n",
     "# import MySQLdb.cursors\n",
     "\n",
-    "import dedupe\n",
-    "import dedupe.backport\n",
     "import boto3\n",
     "from pyathena import connect\n",
     "from pyathena.pandas_cursor import PandasCursor\n",
+    "import dedupe\n",
+    "import dedupe.backport\n",
     "\n",
     "def dict_cursor_execute(cur, query):\n",
     "    df = cur.execute(query).as_pandas()\n",
@@ -118,7 +217,7 @@
     "#             log_level = logging.DEBUG\n",
     "\n",
     "## Armin\n",
-    "    log_level = logging.WARNING\n",
+    "    log_level = logging.DEBUG\n",
     "#######\n",
     "\n",
     "    logging.getLogger().setLevel(log_level)\n",
@@ -150,14 +249,17 @@
     "#                                 read_default_file=MYSQL_CNF)\n",
     "\n",
     "    s3 = boto3.client('s3')  \n",
-    "    conn = connect(s3_staging_dir=athena_garbage,\n",
-    "                     region_name=region, work_group=workgroup)\n",
-    "    cur = conn.cursor(PandasCursor, schema_name=schema_name)"
+    "    conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n",
+    "                   aws_secret_access_key=config.SECRET_ACCESS_KEY,\n",
+    "                   s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n",
+    "                   region_name=config.REGION, \n",
+    "                   work_group=config.WORKGROUP)    \n",
+    "    cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -167,17 +269,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "reading from  mysql_example_settings\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # We'll be using variations on this following select statement to pull\n",
     "    # in campaign donor info.\n",
@@ -269,28 +363,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "blocking...\n",
-      "creating blocking_map database\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<pyathena.pandas_cursor.PandasCursor at 0x7fd9ce27a400>"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # ## Blocking\n",
     "\n",
@@ -320,23 +395,15 @@
     "        'classification'='csv', \n",
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
-    "    '''.format(bucket, root_key+'blocking_map') \n",
+    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') \n",
     "    cur.execute(q)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "creating inverted index\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # If dedupe learned a Index Predicate, we have to take a pass\n",
     "    # through the data and create indices.\n",
@@ -356,39 +423,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "writing blocking map\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'ResponseMetadata': {'RequestId': '5F215F152B811909',\n",
-       "  'HostId': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n",
-       "  'HTTPStatusCode': 200,\n",
-       "  'HTTPHeaders': {'x-amz-id-2': 'B9k8koPR2pp/7lp5WxlEM2etPGjhR3aUdlJq253YoSf1Rt6N8Jo1XAWrfe7EiplzFf++YlcW238=',\n",
-       "   'x-amz-request-id': '5F215F152B811909',\n",
-       "   'date': 'Tue, 30 Jun 2020 15:27:16 GMT',\n",
-       "   'x-amz-server-side-encryption': 'AES256',\n",
-       "   'etag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n",
-       "   'content-length': '0',\n",
-       "   'server': 'AmazonS3'},\n",
-       "  'RetryAttempts': 0},\n",
-       " 'ETag': '\"d41d8cd98f00b204e9800998ecf8427e\"',\n",
-       " 'ServerSideEncryption': 'AES256'}"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # Now we are ready to write our blocking map table by creating a\n",
     "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
@@ -396,7 +433,22 @@
     "    \n",
     "\n",
     "    read_cur_dict = dict_cursor_execute(cur, DONOR_SELECT).to_dict('records')\n",
-    "    full_data = ((row['donor_id'], row) for row in read_cur_dict)\n",
+    "    full_data = ((row['donor_id'], row) for row in read_cur_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "    b_data = deduper.fingerprinter(full_data)\n",
     "    buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n",
     "#         csv_out.writerows(b_data)        \n",
@@ -406,13 +458,13 @@
     "\n",
     "#             write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n",
     "#                                   b_data)\n",
-    "    s3.put_object(Bucket=bucket, Key=root_key+'blocking_map/blocking.csv', Body=buffer)    \n",
+    "    s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer)    \n",
     "#     write_con.commit()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -447,51 +499,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "StopIteration",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mStopIteration\u001b[0m                             Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-13-bb1ab6348ed6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread_cur_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mStopIteration\u001b[0m: "
-     ]
-    }
-   ],
-   "source": [
-    "next(read_cur_dict)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "clustering...\n"
-     ]
-    },
-    {
-     "ename": "BlockingError",
-     "evalue": "No records have been blocked together. Is the data you are trying to match like the data you trained on?",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mBlockingError\u001b[0m                             Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-11-186191d0dae6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'clustering...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n\u001b[0m\u001b[1;32m      5\u001b[0m                                       threshold=0.5)\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\u001b[0m in \u001b[0;36mscore\u001b[0;34m(self, pairs)\u001b[0m\n\u001b[1;32m    104\u001b[0m                                            \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_model\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    105\u001b[0m                                            \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m                                            self.num_cores)\n\u001b[0m\u001b[1;32m    107\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    108\u001b[0m             raise RuntimeError('''\n",
-      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/core.py\u001b[0m in \u001b[0;36mscoreDuplicates\u001b[0;34m(record_pairs, data_model, classifier, num_cores)\u001b[0m\n\u001b[1;32m    218\u001b[0m     \u001b[0mfirst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecord_pairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpeek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord_pairs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    219\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mfirst\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m         raise BlockingError(\"No records have been blocked together. \"\n\u001b[0m\u001b[1;32m    221\u001b[0m                             \u001b[0;34m\"Is the data you are trying to match like \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    222\u001b[0m                             \"the data you trained on?\")\n",
-      "\u001b[0;31mBlockingError\u001b[0m: No records have been blocked together. Is the data you are trying to match like the data you trained on?"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # ## Clustering\n",
     "\n",
@@ -611,6 +621,82 @@
     "\n",
     "    print('ran in', time.time() - start_time, 'seconds')"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# I'm here\n",
+    "Found a way to map block_key to block_numbers\n",
+    "** CREATE TABLE, according to something online, has a longer timeout!\n",
+    "** Looks like I should be using (bucketing)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Problem:\n",
+    "The Athena mapping doesn't have many distinct values; a huge number of rows, for example, have 6061:None:2, while there is only one like this in SQL!?\n",
+    "The problem was probably the address: the concat was buggy and there were too many nulls.\n",
+    "Still, while the raw table matches, donors don't! The Athena one is much bigger.\n",
+    "Start from here: run this query on both; the results are different."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "create table as_blocking_map_number\n",
+    "with (bucketed_by = block_number)\n",
+    "as( \n",
+    "    SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n",
+    "    from blocking_map)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "import sys\n",
+    "sys.path.insert(0, '../athena_example/')\n",
+    "from pyathena import connect\n",
+    "from pyathena.pandas_cursor import PandasCursor\n",
+    "\n",
+    "import config\n",
+    "\n",
+    "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n",
+    "               aws_secret_access_key=config.SECRET_ACCESS_KEY,\n",
+    "               s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n",
+    "               region_name=config.REGION, \n",
+    "               work_group=config.WORKGROUP)    \n",
+    "cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)\n",
+    "q='''\n",
+    "with blocking_map_number as( \n",
+    "    SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n",
+    "    from blocking_map)\n",
+    "create table donor_id_pairs as (\n",
+    "    SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
+    "    from blocking_map_number as l\n",
+    "    INNER JOIN blocking_map_number as r\n",
+    "    using (block_number)\n",
+    "    where l.donor_id < r.donor_id)\n",
+    "'''\n",
+    "cur.execute(q)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -629,7 +715,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.6.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
index b86d1b9f..5e8a5a32 100644
--- a/notebooks/athena_init_db.ipynb
+++ b/notebooks/athena_init_db.ipynb
@@ -21,6 +21,7 @@
     "ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n",
     "WORKGROUP = 'RIA'\n",
     "REGION = 'eu-west-1'\n",
+    "SCHEMA_NAME = 'ria_data_science_s3'\n",
     "\n",
     "# Database Parameters\n",
     "DATABASE_BUCKET = 'com.ria.scratch'\n",
@@ -29,7 +30,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -67,6 +68,7 @@
     "import boto3\n",
     "from pyathena import connect\n",
     "import config\n",
+    "import csv\n",
     "\n",
     "\n",
     "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
@@ -89,26 +91,27 @@
     "    zip_file.close()\n",
     "\n",
     "\n",
-    "s3 = boto3.client('s3')  \n",
     "\n",
+    "def as_pandas(query, **kwrgs):\n",
+    "    return utils.athena_to_panda(query, escapechar='\\\\', dtype='object', keep_default_na=False, na_values=[''], **kwrgs)\n",
     "\n",
     "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n",
     "               aws_secret_access_key=config.SECRET_ACCESS_KEY,\n",
     "               s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n",
     "               region_name=config.REGION, \n",
     "               work_group=config.WORKGROUP)\n",
-    "c = conn.cursor()\n",
+    "c = conn.cursor(schema_name=config.SCHEMA_NAME)\n",
     "\n",
     "print('importing raw data from csv...')\n",
-    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.raw_table\")\n",
-    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.donors\")\n",
-    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.recipients\")\n",
-    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.contributions\")\n",
-    "c.execute(\"DROP TABLE IF EXISTS ria_data_science_s3.processed_donors\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS raw_table\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS donors\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS recipients\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS contributions\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS processed_donors\")\n",
     "\n",
     "\n",
     "q=r'''\n",
-    "CREATE EXTERNAL TABLE ria_data_science_s3.raw_table \n",
+    "CREATE EXTERNAL TABLE raw_table \n",
     "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
     "    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n",
     "    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n",
@@ -125,6 +128,7 @@
     "ROW FORMAT DELIMITED\n",
     "  FIELDS TERMINATED BY '\\t'\n",
     "  LINES TERMINATED BY '\\n'  \n",
+    "  ESCAPED BY '\\\\'\n",
     "LOCATION\n",
     "    's3://{}/{}' \n",
     "TBLPROPERTIES (\n",
@@ -132,11 +136,12 @@
     "    'skip.header.line.count'='1',  \n",
     "    'serialization.null.format'='')\n",
     "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') \n",
-    "c.execute(q)\n",
+    "utils.athena_start_query(q)\n",
     "\n",
     "\n",
+    "df = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE,  \n",
+    "                 error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#,\n",
     "\n",
-    "df = pd.read_csv(contributions_txt_file, sep='\\t', error_bad_lines=False, dtype=str, index_col=0)\n",
     "# Remove the very few records that mess up the demo \n",
     "# (demo purposes only! Don't do something like this in production)\n",
     "df = df[df['RcvDate'].str.len()>=10]\n",
@@ -147,21 +152,25 @@
     "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
     "\n",
     "#committee ID is requred. Remove the 2 rows that don't have it.\n",
-    "df = df[df['ID'] != '']\n",
+    "df = df[df['ID']!='']\n",
     "\n",
     "# There's a record with a date stuck in the committee_id column, which causes\n",
     "# problems when inserting into the contributions table below. Get rid of it this \n",
     "# way.\n",
     "df = df[df['ID'].str.len() <=9]\n",
     "\n",
-    "# Nullifying empty strings\n",
-    "df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
+    "# dropping the last (unnamed) column\n",
+    "df = df.drop(columns='Unnamed: 29')\n",
     "\n",
-    "s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'raw_table/'+contributions_txt_file, Body=df.to_csv(sep=\"\\t\"))\n",
+    "# Nullifying empty strings\n",
+    "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
+    "df_lower=df.apply(lambda x: x.str.lower() if x.dtype=='object' else x, result_type='expand')\n",
+    "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n",
+    "           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))\n",
     "\n",
     "print('creating donors table...')\n",
     "q='''\n",
-    "CREATE TABLE ria_data_science_s3.donors as\n",
+    "CREATE TABLE donors as\n",
     "    with tmp as\n",
     "      (SELECT DISTINCT \n",
     "           TRIM(last_name) as last_name, TRIM(first_name) as first_name, \n",
@@ -169,20 +178,20 @@
     "           TRIM(city) city, TRIM(state) as state, \n",
     "           TRIM(zip) as zip, TRIM(employer) as employer, \n",
     "           TRIM(occupation) as occupation\n",
-    "      FROM ria_data_science_s3.raw_table)\n",
+    "      FROM raw_table)\n",
     "    SELECT row_number() over () as donor_id, * from tmp'''\n",
-    "c.execute(q)\n",
+    "utils.athena_start_query(q)\n",
     "\n",
     "\n",
     "q='''\n",
-    "CREATE TABLE ria_data_science_s3.recipients as\n",
-    "    SELECT DISTINCT committee_id, committee_name FROM ria_data_science_s3.raw_table\n",
+    "CREATE TABLE recipients as\n",
+    "    SELECT DISTINCT committee_id, committee_name FROM raw_table\n",
     "'''\n",
-    "c.execute(q)\n",
+    "utils.athena_start_query(q)\n",
     "\n",
     "print('creating contributions table')\n",
     "q='''\n",
-    "CREATE TABLE ria_data_science_s3.contributions as\n",
+    "CREATE TABLE contributions as\n",
     "    SELECT reciept_id, donors.donor_id, committee_id, \n",
     "        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n",
     "        loan_amount, amount, \n",
@@ -192,7 +201,7 @@
     "        election_type, election_year, \n",
     "        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n",
     "        date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n",
-    "    FROM ria_data_science_s3.raw_table JOIN ria_data_science_s3.donors ON \n",
+    "    FROM raw_table JOIN donors ON \n",
     "        donors.first_name = TRIM(raw_table.first_name) AND \n",
     "        donors.last_name = TRIM(raw_table.last_name) AND \n",
     "        donors.address_1 = TRIM(raw_table.address_1) AND \n",
@@ -202,27 +211,27 @@
     "        donors.employer = TRIM(raw_table.employer) AND \n",
     "        donors.occupation = TRIM(raw_table.occupation) AND \n",
     "        donors.zip = TRIM(raw_table.zip)'''\n",
-    "c.execute(q)\n",
+    "utils.athena_start_query(q)\n",
     "\n",
     "q = '''\n",
-    "CREATE TABLE ria_data_science_s3.processed_donors AS  \n",
+    "CREATE TABLE processed_donors AS  \n",
     "    SELECT donor_id,  \n",
     "     LOWER(city) AS city,  \n",
     "     CASE WHEN (first_name IS NULL AND last_name IS NULL) \n",
     "          THEN NULL \n",
-    "          ELSE LOWER(CONCAT(first_name, ' ', last_name)) \n",
+    "          ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n",
     "     END AS name,  \n",
     "     LOWER(zip) AS zip,  \n",
     "     LOWER(state) AS state,  \n",
     "     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n",
     "          THEN NULL \n",
-    "          ELSE LOWER(CONCAT(address_1, ' ', address_2)) \n",
+    "          ELSE LOWER(array_join(filter(array[address_1, address_1], x-> x IS NOT NULL), ' '))\n",
     "     END AS address,  \n",
     "     LOWER(occupation) AS occupation, \n",
     "     LOWER(employer) AS employer, \n",
     "     first_name is null AS person \n",
-    " FROM ria_data_science_s3.donors'''\n",
-    "c.execute(q)\n",
+    " FROM donors'''\n",
+    "utils.athena_start_query(q)\n",
     "\n",
     "\n",
     "\n",
@@ -232,7 +241,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -245,6 +254,11 @@
       "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
       "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
       "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
+      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
       "creating donors table...\n",
       "creating contributions table\n",
       "done\n"
@@ -279,7 +293,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.6.10"
   }
  },
  "nbformat": 4,

From 409527de23e14336245cc9683ef3d55e3e9d852a Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-8-11.eu-west-1.compute.internal>
Date: Fri, 31 Jul 2020 06:25:17 +0000
Subject: [PATCH 07/19] athena-example, first version

---
 .gitignore                       |   1 +
 README.md                        |  62 ++---
 athena_example/README.md         |  15 +-
 athena_example/athena_example.py | 411 +++++++++++++++++++++++++++++++
 athena_example/athena_init.py    | 215 ++++++++++++++++
 athena_example/config.py         |  12 +
 athena_example/mysql.cnf_LOCAL   |   4 -
 athena_example/mysql_example.py  | 344 --------------------------
 athena_example/mysql_init_db.py  | 234 ------------------
 athena_example/utils.py          | 138 +++++++++++
 notebooks/athena_example.ipynb   | 408 +++++++-----------------------
 notebooks/athena_init_db.ipynb   |  99 +++++---
 12 files changed, 941 insertions(+), 1002 deletions(-)
 create mode 100644 athena_example/athena_example.py
 create mode 100644 athena_example/athena_init.py
 create mode 100644 athena_example/config.py
 delete mode 100644 athena_example/mysql.cnf_LOCAL
 delete mode 100644 athena_example/mysql_example.py
 delete mode 100644 athena_example/mysql_init_db.py
 create mode 100644 athena_example/utils.py

diff --git a/.gitignore b/.gitignore
index 3fb24683..a29de92b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ ENV
 distpgsql_init_db.py
 pgsql_example/pgsql_init_db.py
 .idea
+.ipynb_checkpoints*
diff --git a/README.md b/README.md
index bb1fe2c8..82abad3d 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 # Dedupe Examples
 
-Example scripts for the [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data.
+Athena example scripts for [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data.
 
 Part of the [Dedupe.io](https://dedupe.io/) cloud service and open source toolset for de-duplicating and finding fuzzy matches in your data. For more details, see the [differences between Dedupe.io and the dedupe library](https://dedupe.io/documentation/should-i-use-dedupeio-or-the-dedupe-python-library.html).
 
-To get these examples:
+To get the athena examples:
 ```bash
-git clone https://github.com/dedupeio/dedupe-examples.git
+git clone https://github.com/asajadi/dedupe-examples.git
 cd dedupe-examples
 ```
 
@@ -34,61 +34,29 @@ Afterwards, whenever you want to work on dedupe-examples,
 workon dedupe-examples
 ```
 
-### [CSV example](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html) - early childhood locations
 
-This example works with a list of early childhood education sites in Chicago from 10 different sources.
+### [athena example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions
 
-```bash
-cd csv_example
-pip install unidecode
-python csv_example.py
-```
-  (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished)
-
-**To see how you might use dedupe with smallish data, see the [annotated source code for csv_example.py](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html).**
-
-### [Patent example](https://dedupeio.github.io/dedupe-examples/docs/patent_example.html) -  patent holders
-
-This example works with Dutch inventors from the PATSTAT international patent data file
-
-```bash
-cd patent_example
-pip install unidecode
-python patent_example.py
-```
-  (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished)
+Takes a database of IL campaign contribution data, loads it into an
+Athena database, and identifies the unique donors. 
 
-### [Record Linkage example](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html) -  electronics products
-This example links two spreadsheets of electronics products and links up the matching entries. Each dataset individually has no duplicates.
+To follow this example you need to 
 
-```bash
-cd record_linkage_example
-python record_linkage_example.py
-```
+* Create an Athena database called 'contributions'
+* Update `athena_example/config.py` with your Athena credentials
+* Install dependencies, `pip install -r requirements.txt`
 
-**To see how you might use dedupe for linking datasets, see the [annotated source code for record_linkage_example.py](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html).**
-
-### [Gazetteer example](https://dedupeio.github.io/dedupe-examples/docs/gazetteer_example.html) -  electronics products
-This example links two spreadsheets of electronics products and links up the matching entries using the Gazetteer class
+Once that's all done you can run the example:
 
 ```bash
-cd gazetteer_example.py
-python gazetteer_example.py
+cd athena_example
+python athena_init.py
+python athena_example.py
 ```
 
+  (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) 
 
-### [MySQL example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions
-
-See `mysql_example/README.md` for details
-
-**To see how you might use dedupe with bigish data, see the [annotated source code for mysql_example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html).**
-
-
-### [PostgreSQL big dedupe example](https://dedupeio.github.io/dedupe-examples/docs/pgsql_big_dedupe_example.html) - PostgreSQL example on large dataset
-
-See `pgsql_big_dedupe_example/README.md` for details
 
-This is the same example as the MySQL IL campaign contributions dataset above, but ported to run on PostgreSQL.
 
 
 ## Training
diff --git a/athena_example/README.md b/athena_example/README.md
index a027b3b0..3530935d 100644
--- a/athena_example/README.md
+++ b/athena_example/README.md
@@ -1,23 +1,20 @@
-# MySQL Example
+# Athena Example
 
 Takes a database of IL campaign contribution data, loads it in to a
-MySQL database, and identifies the unique donors. This can take a few
-hours and will noticeably tax your laptop. You might want to run it
-overnight.
+Athena database, and identifies the unique donors. 
 
 To follow this example you need to 
 
-* Create a MySQL database called 'contributions'
-* Copy `mysql_example/mysql.cnf_LOCAL` to `mysql_example/mysql.cnf`
-* Update `mysql_example/mysql.cnf` with your MySQL username and password
+* Create an Athena database called 'contributions'
+* Update `athena_example/config.py` with your Athena credentials
 * Install dependencies, `pip install -r requirements.txt`
 
 Once that's all done you can run the example:
 
 ```bash
 cd mysql_example
-python mysql_init_db.py 
-python mysql_example.py
+python athena_init.py
+python athena_example.py
 ```
 
   (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) 
diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py
new file mode 100644
index 00000000..a738c56c
--- /dev/null
+++ b/athena_example/athena_example.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+# %load ../mysql_example/mysql_example.py
+#!/usr/bin/python
+
+"""
+This is an example of working with very large data. There are about
+700,000 unduplicated donors in this database of Illinois political
+campaign contributions.
+
+With such a large set of input data, we cannot store all the comparisons
+we need to make in memory. Instead, we will read the pairs on demand
+from the Athena database.
+
+__Note:__ You will need to run `python athena_init.py`
+before running this script. See the annotated source for
+[athena_init.py](athena_init.py)
+
+For smaller datasets (<10,000), see our
+[csv_example](csv_example.html)
+"""
+
+# The results of this module differ slightly from those of the MySQL
+# example. The difference comes from a few special (and mostly
+# erroneous) characters, such as \a, which MySQL and Athena/pandas
+# handle differently.
+
+import sys
+import os
+import itertools
+import time
+import logging
+import optparse
+import locale
+import json
+from io import StringIO
+import csv
+import pandas as pd
+
+import boto3
+import dedupe
+import dedupe.backport
+sys.path.insert(0, '../athena_example/')
+import config
+sys.path.insert(0, '../athena_example/')
+import utils
+
+def as_pandas(query, **kwrgs):
+    df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)
+    return df.where(pd.notnull(df), None)
+
+def record_pairs(result_set):
+    for i, row in enumerate(result_set):
+        a_record_id, a_record, b_record_id, b_record = row
+        record_a = (a_record_id, json.loads(a_record))
+        record_b = (b_record_id, json.loads(b_record))
+
+        yield record_a, record_b
+
+        if i % 10000 == 0:
+            print(i)
+
+
+def cluster_ids(clustered_dupes):
+
+    for cluster, scores in clustered_dupes:
+        cluster_id = cluster[0]
+        for donor_id, score in zip(cluster, scores):
+            yield donor_id, cluster_id, score
+
+
+if __name__ == '__main__':
+
+    # ## Logging
+
+    # Dedupe uses Python logging to show or suppress verbose output. Added
+    # for convenience.  To enable verbose output, run `python
+    # athena_example/athena_example.py -v`
+    
+    optp = optparse.OptionParser()
+    optp.add_option('-v', '--verbose', dest='verbose', action='count',
+                    help='Increase verbosity (specify multiple times for more)'
+                    )
+    (opts, args) = optp.parse_args()
+    log_level = logging.WARNING
+    if opts.verbose:
+        if opts.verbose == 1:
+            log_level = logging.INFO
+        elif opts.verbose >= 2:
+            log_level = logging.DEBUG
+
+
+    logging.getLogger().setLevel(log_level)
+
+    
+
+
+    settings_file = 'mysql_example_settings'
+    training_file = 'mysql_example_training.json'
+
+    start_time = time.time()
+
+
+# In[ ]:
+
+
+# We'll be using variations on this following select statement to pull
+# in campaign donor info.
+#
+# We did a fair amount of preprocessing of the fields in
+# `athena_init.py`
+DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address "                "from processed_donors"
+
+# ## Training
+
+if os.path.exists(settings_file):
+    print('reading from ', settings_file)
+    with open(settings_file, 'rb') as sf:
+        deduper = dedupe.StaticDedupe(sf, num_cores=4)
+else:
+    # Define the fields dedupe will pay attention to
+    #
+    # The address, city, and zip fields are often missing, so we'll
+    # tell dedupe that, and we'll learn a model that take that into
+    # account
+    fields = [{'field': 'name', 'type': 'String'},
+              {'field': 'address', 'type': 'String',
+               'has missing': True},
+              {'field': 'city', 'type': 'ShortString', 'has missing': True},
+              {'field': 'state', 'type': 'ShortString', 'has missing': True},
+              {'field': 'zip', 'type': 'ShortString', 'has missing': True},
+              ]
+
+    # Create a new deduper object and pass our data model to it.
+    deduper = dedupe.Dedupe(fields, num_cores=4)
+
+    # We will sample pairs from the entire donor table for training
+#         with read_con.cursor() as cur:
+
+    # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! 
+    # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str
+#         with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:
+    temp_df = as_pandas(DONOR_SELECT)
+    temp_d = temp_df.to_dict('index')
+        
+
+    # If we have training data saved from a previous run of dedupe,
+    # look for it and load it in.
+    #
+    # __Note:__ if you want to train from
+    # scratch, delete the training_file
+    if os.path.exists(training_file):
+        print('reading labeled examples from ', training_file)
+        with open(training_file) as tf:
+            deduper.prepare_training(temp_d, training_file=tf)
+    else:
+        deduper.prepare_training(temp_d)
+
+    del temp_d
+
+    # ## Active learning
+
+    print('starting active labeling...')
+    # Starts the training loop. Dedupe will find the next pair of records
+    # it is least certain about and ask you to label them as duplicates
+    # or not.
+
+    # use 'y', 'n' and 'u' keys to flag duplicates
+    # press 'f' when you are finished
+    dedupe.convenience.console_label(deduper)
+    # When finished, save our labeled, training pairs to disk
+    with open(training_file, 'w') as tf:
+        deduper.write_training(tf)
+
+    # Notice the argument here
+    #
+    # `recall` is the proportion of true dupes pairs that the learned
+    # rules must cover. You may want to reduce this if you are making
+    # too many blocks and too many comparisons.
+    deduper.train(recall=0.90)
+
+    with open(settings_file, 'wb') as sf:
+        deduper.write_settings(sf)
+
+    # We can now remove some of the memory-hogging objects we used
+    # for training
+    deduper.cleanup_training()
+
+
+# In[ ]:
+
+
+# ## Blocking
+
+print('blocking...')
+
+# To run blocking on such a large set of data, we create a separate table
+# that contains blocking keys and record ids
+print('creating blocking_map database')
+utils.athena_start_query("DROP TABLE IF EXISTS blocking_map")
+
+q='''
+CREATE EXTERNAL TABLE blocking_map     
+    (block_key VARCHAR(200), donor_id INTEGER)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t'
+  LINES TERMINATED BY '\n'  
+LOCATION
+    's3://{}/{}' 
+TBLPROPERTIES (
+    'classification'='csv', 
+    --'skip.header.line.count'='1',  
+    'serialization.null.format'='')
+'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') 
+utils.athena_start_query(q)
+
+
+# In[ ]:
+
+
+# If dedupe learned an Index Predicate, we have to take a pass
+# through the data and create indices.
+print('creating inverted index')
+
+# Armin: 
+# This never runs, index_fields is empty, possible bug?
+for field in deduper.fingerprinter.index_fields:
+    q = '''
+    SELECT DISTINCT {field} FROM processed_donors 
+    WHERE {field} IS NOT NULL
+    '''.format(field=field)
+    cur_df = as_pandas(q)
+    # Do I need to cast it as a list?
+    field_data = cur_df[field]
+    deduper.fingerprinter.index(field_data, field)
+ 
+
+
+# In[ ]:
+
+
+# Now we are ready to write our blocking map table by creating a
+# generator that yields unique `(block_key, donor_id)` tuples.
+print('writing blocking map')
+
+
+read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records')
+full_data = ((row['donor_id'], row) for row in read_cur_dict)
+
+
+# In[ ]:
+
+
+b_data = deduper.fingerprinter(full_data)
+buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\t')    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer)    
+
+
+# In[ ]:
+
+
+
+    # select unique pairs to compare
+    q='''
+    SELECT a.donor_id,
+        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],
+                              ARRAY[ a.city, a.name, a.zip, a.state, a.address])
+                    AS JSON)),
+        b.donor_id,
+        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], 
+                  ARRAY[ b.city, b.name, b.zip, b.state, b.address])
+              AS JSON))
+    FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west
+         from blocking_map as l
+         INNER JOIN blocking_map as r
+         using (block_key)
+         where l.donor_id < r.donor_id) ids
+    INNER JOIN processed_donors a on ids.east=a.donor_id
+    INNER JOIN processed_donors b on ids.west=b.donor_id
+    '''
+    read_cur_dict=as_pandas(q).itertuples(index=False, name=None)
+
+
+# In[ ]:
+
+
+# ## Clustering
+
+print('clustering...')
+clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),
+                                  threshold=0.5)
+
+
+# In[ ]:
+
+
+utils.athena_start_query("DROP TABLE IF EXISTS entity_map")
+
+print('creating entity_map database')
+q='''
+CREATE EXTERNAL TABLE entity_map     
+    (donor_id INTEGER, canon_id INTEGER, 
+     cluster_score FLOAT)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t'
+  LINES TERMINATED BY '\n'  
+LOCATION
+    's3://{}/{}' 
+TBLPROPERTIES (
+    'classification'='csv', 
+    --'skip.header.line.count'='1',  
+    'serialization.null.format'='')
+'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') 
+utils.athena_start_query(q) 
+
+buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\t')
+utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer)    
+
+
+# In[ ]:
+
+
+# Print out the number of duplicates found
+print('# duplicate sets')
+
+# ## Payoff
+
+# With all this done, we can now begin to ask interesting questions
+# of the data
+#
+# For example, let's see who the top 10 donors are.
+
+locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers
+
+utils.athena_start_query("DROP TABLE IF EXISTS e_map")
+q = '''
+CREATE TABLE e_map as 
+    SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id 
+    FROM entity_map 
+        RIGHT JOIN donors USING(donor_id)
+'''
+
+utils.athena_start_query(q)
+q ='''
+SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
+    donation_totals.totals AS totals 
+FROM donors INNER JOIN 
+    (SELECT canon_id, SUM(cast (amount as double)) AS totals 
+    FROM contributions INNER JOIN e_map 
+    USING (donor_id) 
+    GROUP BY (canon_id) 
+    ORDER BY totals 
+    DESC LIMIT 10) 
+    AS donation_totals 
+ON donors.donor_id = donation_totals.canon_id
+ORDER BY totals DESC
+'''
+cur_dict = as_pandas(q).to_dict('records')
+
+print("Top Donors (deduped)")
+for row in cur_dict:
+    row['totals'] = locale.currency(row['totals'], grouping=True)
+    print('%(totals)20s: %(name)s' % row)
+
+# Compare this to what we would have gotten if we hadn't done any
+# deduplication
+
+q = '''
+with donorscontributions as(
+
+    SELECT donors.donor_id, 
+        array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,
+        cast(contributions.amount as double) as amount
+    FROM donors INNER JOIN contributions 
+        USING (donor_id) 
+)
+SELECT name, sum(amount) AS totals  
+FROM donorscontributions
+GROUP BY donor_id, name
+ORDER BY totals DESC 
+LIMIT 10
+'''
+
+cur_dict = as_pandas(q).to_dict('records')
+
+print("Top Donors (raw)")
+for row in cur_dict:
+    row['totals'] = locale.currency(row['totals'], grouping=True)
+    print('%(totals)20s: %(name)s' % row)
+
+# Close our database connection
+#     read_con.close()
+#     write_con.close()
+
+print('ran in', time.time() - start_time, 'seconds')
+
+
+# In[9]:
+
+
+get_ipython().system('jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/')
+
+
+# In[ ]:
+
+
+
+
diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py
new file mode 100644
index 00000000..9ddb14c8
--- /dev/null
+++ b/athena_example/athena_init.py
@@ -0,0 +1,215 @@
+#!/usr/bin/python
+"""
+This is a setup script for athena_example.  It downloads a zip file of
+Illinois campaign contributions and loads them into an Athena database
+named 'contributions'.
+ 
+__Note:__ You will need to run this script first before executing
+[athena_example.py](athena_example.py).
+ 
+Tables created:
+* raw_table - raw import of entire CSV file
+* donors - all distinct donors based on name and address
+* recipients - all distinct campaign contribution recipients
+* contributions - contribution amounts tied to donor and recipients tables
+"""
+
+import os
+import zipfile
+import warnings
+import pandas as pd
+import numpy as np
+from urllib.request import urlopen
+import boto3
+import config
+import csv
+import sys
+sys.path.insert(0, '../athena_example/')
+import utils
+
+
+contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'
+contributions_txt_file = 'Illinois-campaign-contributions.txt'
+
+if not os.path.exists(contributions_zip_file) :
+    print('downloading', contributions_zip_file, '(~60mb) ...')
+    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')
+    localFile = open(contributions_zip_file, 'wb')
+    localFile.write(u.read())
+    localFile.close()
+
+if not os.path.exists(contributions_txt_file) :
+    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')
+    print('extracting %s' % contributions_zip_file)
+    zip_file_contents = zip_file.namelist()
+    for f in zip_file_contents:
+        if ('.txt' in f):
+            zip_file.extract(f)
+    zip_file.close()
+
+
+
+
+print('importing raw data from csv...')
+utils.athena_start_query("DROP TABLE IF EXISTS raw_table")
+utils.athena_start_query("DROP TABLE IF EXISTS donors")
+utils.athena_start_query("DROP TABLE IF EXISTS recipients")
+utils.athena_start_query("DROP TABLE IF EXISTS contributions")
+utils.athena_start_query("DROP TABLE IF EXISTS processed_donors")
+
+
+q=r'''
+CREATE EXTERNAL TABLE raw_table 
+    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), 
+    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), 
+    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), 
+    date_recieved VARCHAR(10), loan_amount VARCHAR(12), 
+    amount VARCHAR(23), receipt_type VARCHAR(23), 
+    employer VARCHAR(70), occupation VARCHAR(40), 
+    vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), 
+    vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), 
+    vendor_city VARCHAR(20), vendor_state VARCHAR(10), 
+    vendor_zip VARCHAR(10), description VARCHAR(90), 
+    election_type VARCHAR(10), election_year VARCHAR(10), 
+    report_period_begin VARCHAR(10), report_period_end VARCHAR(33), 
+    committee_name VARCHAR(70), committee_id VARCHAR(37)) 
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t'
+  ESCAPED BY '\\'
+  LINES TERMINATED BY '\n'  
+LOCATION
+    's3://{}/{}' 
+TBLPROPERTIES (
+    'classification'='csv', 
+    'skip.header.line.count'='1',  
+    'serialization.null.format'='')
+'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') 
+utils.athena_start_query(q)
+
+
+df = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE,  
+                 error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#,
+
+# Remove the very few records that mess up the demo 
+# (demo purposes only! Don't do something like this in production)
+df = df[df['RcvDate'].str.len()>=10]
+
+# set empty, non-zero, strings in date columns to null
+df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan
+
+df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan
+
+# Committee ID is required. Remove the 2 rows that don't have it.
+df = df[df['ID']!='']
+
+# There's a record with a date stuck in the committee_id column, which causes
+# problems when inserting into the contributions table below. Get rid of it this 
+# way.
+df = df[df['ID'].str.len() <=9]
+
+# dropping the last columns
+df = df.drop(columns='Unnamed: 29')
+
+# Nullifying empty strings
+# df = df.replace(r'^\s*$', np.nan, regex=True)
+df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')
+
+utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None),
+           filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))
+
+# Athena doesn't equate empty string and null, even though the table spec says so.
+# It's not exactly a bug: it works when the value is null in the source, but not after applying TRIM to it.
+# So we need to take care of that manually.
+print('creating donors table...')
+q='''
+CREATE TABLE donors as
+    with tmp as
+      (SELECT DISTINCT 
+           NULLIF(TRIM(last_name), '') as last_name, 
+           NULLIF(TRIM(first_name), '') as first_name, 
+           NULLIF(TRIM(address_1), '') as address_1, 
+           NULLIF(TRIM(address_2), '') as address_2, 
+           NULLIF(TRIM(city), '') city, 
+           NULLIF(TRIM(state), '') as state, 
+           NULLIF(TRIM(zip), '') as zip, 
+           NULLIF(TRIM(employer), '') as employer, 
+           NULLIF(TRIM(occupation), '') as occupation
+      FROM raw_table)
+    SELECT row_number() over () as donor_id, * from tmp'''
+utils.athena_start_query(q)
+
+
+q='''
+CREATE TABLE recipients as
+    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table
+'''
+utils.athena_start_query(q)
+
+print('creating contributions table')
+
+# --
+# c.execute("CREATE TABLE contributions "
+#           "(contribution_id INT, donor_id INT, recipient_id INT, "
+#           " report_type VARCHAR(24), date_recieved DATE, "
+#           " loan_amount VARCHAR(12), amount VARCHAR(23), "
+#           " receipt_type VARCHAR(23), "
+#           " vendor_last_name VARCHAR(70), "
+#           " vendor_first_name VARCHAR(20), "
+#           " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
+#           " vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
+#           " vendor_zip VARCHAR(10), description VARCHAR(90), "
+#           " election_type VARCHAR(10), election_year VARCHAR(10), "
+#           " report_period_begin DATE, report_period_end DATE) "
+#           "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
+# --
+
+q='''
+CREATE TABLE contributions as
+    SELECT reciept_id as contribution_id, 
+        donors.donor_id as donor_id , 
+        committee_id as recipient_id, 
+        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, 
+        loan_amount, amount, 
+        receipt_type, vendor_last_name , 
+        vendor_first_name, vendor_address_1, vendor_address_2, 
+        vendor_city, vendor_state, vendor_zip, description, 
+        election_type, election_year, 
+        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, 
+        date_parse(report_period_end, '%m/%d/%Y') as report_period_end 
+    FROM raw_table JOIN donors ON 
+        coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND 
+        coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND 
+        coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND 
+        coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND 
+        coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND 
+        coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND 
+        coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND 
+        coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND 
+        coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')'''
+
+utils.athena_start_query(q)
+
+q = '''
+CREATE TABLE processed_donors AS  
+    SELECT donor_id,  
+     LOWER(city) AS city,  
+     CASE WHEN (first_name IS NULL AND last_name IS NULL) 
+          THEN NULL 
+          ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) 
+     END AS name,  
+     LOWER(zip) AS zip,  
+     LOWER(state) AS state,  
+     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) 
+          THEN NULL 
+          ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))
+     END AS address,  
+     LOWER(occupation) AS occupation, 
+     LOWER(employer) AS employer, 
+     first_name is null AS person 
+ FROM donors'''
+utils.athena_start_query(q)
+
+
+
+
+print('done')
diff --git a/athena_example/config.py b/athena_example/config.py
new file mode 100644
index 00000000..60964c73
--- /dev/null
+++ b/athena_example/config.py
@@ -0,0 +1,12 @@
+LOG_FILE = 'log.txt'
+# Connection parameters
+ACCESS_KEY_ID = None
+SECRET_ACCESS_KEY = None
+ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'
+WORKGROUP = 'RIA'
+REGION = 'eu-west-1'
+DATABASE = 'ria_data_science_s3'
+
+# Database Parameters
+DATABASE_BUCKET = 'com.ria.scratch'
+DATABASE_ROOT_KEY = 'as-dedupe/'
diff --git a/athena_example/mysql.cnf_LOCAL b/athena_example/mysql.cnf_LOCAL
deleted file mode 100644
index 17bded3f..00000000
--- a/athena_example/mysql.cnf_LOCAL
+++ /dev/null
@@ -1,4 +0,0 @@
-[client]
-user = your_mysql_user
-password = your_mysql_password
-default-character-set=utf8
diff --git a/athena_example/mysql_example.py b/athena_example/mysql_example.py
deleted file mode 100644
index 5e257e13..00000000
--- a/athena_example/mysql_example.py
+++ /dev/null
@@ -1,344 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-This is an example of working with very large data. There are about
-700,000 unduplicated donors in this database of Illinois political
-campaign contributions.
-
-With such a large set of input data, we cannot store all the comparisons
-we need to make in memory. Instead, we will read the pairs on demand
-from the MySQL database.
-
-__Note:__ You will need to run `python mysql_init_db.py`
-before running this script. See the annotates source for
-[mysql_init_db.py](mysql_init_db.html)
-
-For smaller datasets (<10,000), see our
-[csv_example](csv_example.html)
-"""
-
-import os
-import itertools
-import time
-import logging
-import optparse
-import locale
-import json
-
-import MySQLdb
-import MySQLdb.cursors
-
-import dedupe
-import dedupe.backport
-
-
-def record_pairs(result_set):
-    for i, row in enumerate(result_set):
-        a_record_id, a_record, b_record_id, b_record = row
-        record_a = (a_record_id, json.loads(a_record))
-        record_b = (b_record_id, json.loads(b_record))
-
-        yield record_a, record_b
-
-        if i % 10000 == 0:
-            print(i)
-
-
-def cluster_ids(clustered_dupes):
-
-    for cluster, scores in clustered_dupes:
-        cluster_id = cluster[0]
-        for donor_id, score in zip(cluster, scores):
-            yield donor_id, cluster_id, score
-
-
-if __name__ == '__main__':
-
-    # ## Logging
-
-    # Dedupe uses Python logging to show or suppress verbose output. Added
-    # for convenience.  To enable verbose output, run `python
-    # examples/mysql_example/mysql_example.py -v`
-    optp = optparse.OptionParser()
-    optp.add_option('-v', '--verbose', dest='verbose', action='count',
-                    help='Increase verbosity (specify multiple times for more)'
-                    )
-    (opts, args) = optp.parse_args()
-    log_level = logging.WARNING
-    if opts.verbose:
-        if opts.verbose == 1:
-            log_level = logging.INFO
-        elif opts.verbose >= 2:
-            log_level = logging.DEBUG
-    logging.getLogger().setLevel(log_level)
-
-    # ## Setup
-    MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf'
-
-    settings_file = 'mysql_example_settings'
-    training_file = 'mysql_example_training.json'
-
-    start_time = time.time()
-
-    # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to
-    # `examples/mysql_example/mysql.cnf` and fill in your mysql database
-    # information in `examples/mysql_example/mysql.cnf`
-
-    # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid
-    # having to have enormous result sets in
-    # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor).
-    read_con = MySQLdb.connect(db='contributions',
-                               charset='utf8',
-                               read_default_file=MYSQL_CNF,
-                               cursorclass=MySQLdb.cursors.SSDictCursor)
-
-    write_con = MySQLdb.connect(db='contributions',
-                                charset='utf8',
-                                read_default_file=MYSQL_CNF)
-
-    # We'll be using variations on this following select statement to pull
-    # in campaign donor info.
-    #
-    # We did a fair amount of preprocessing of the fields in
-    # `mysql_init_db.py`
-
-    DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address " \
-                   "from processed_donors"
-
-    # ## Training
-
-    if os.path.exists(settings_file):
-        print('reading from ', settings_file)
-        with open(settings_file, 'rb') as sf:
-            deduper = dedupe.StaticDedupe(sf, num_cores=4)
-    else:
-        # Define the fields dedupe will pay attention to
-        #
-        # The address, city, and zip fields are often missing, so we'll
-        # tell dedupe that, and we'll learn a model that take that into
-        # account
-        fields = [{'field': 'name', 'type': 'String'},
-                  {'field': 'address', 'type': 'String',
-                   'has missing': True},
-                  {'field': 'city', 'type': 'ShortString', 'has missing': True},
-                  {'field': 'state', 'type': 'ShortString', 'has missing': True},
-                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},
-                  ]
-
-        # Create a new deduper object and pass our data model to it.
-        deduper = dedupe.Dedupe(fields, num_cores=4)
-
-        # We will sample pairs from the entire donor table for training
-        with read_con.cursor() as cur:
-            cur.execute(DONOR_SELECT)
-            temp_d = {i: row for i, row in enumerate(cur)}
-
-        # If we have training data saved from a previous run of dedupe,
-        # look for it an load it in.
-        #
-        # __Note:__ if you want to train from
-        # scratch, delete the training_file
-        if os.path.exists(training_file):
-            print('reading labeled examples from ', training_file)
-            with open(training_file) as tf:
-                deduper.prepare_training(temp_d, training_file=tf)
-        else:
-            deduper.prepare_training(temp_d)
-
-        del temp_d
-
-        # ## Active learning
-
-        print('starting active labeling...')
-        # Starts the training loop. Dedupe will find the next pair of records
-        # it is least certain about and ask you to label them as duplicates
-        # or not.
-
-        # use 'y', 'n' and 'u' keys to flag duplicates
-        # press 'f' when you are finished
-        dedupe.convenience.console_label(deduper)
-        # When finished, save our labeled, training pairs to disk
-        with open(training_file, 'w') as tf:
-            deduper.write_training(tf)
-
-        # Notice our the argument here
-        #
-        # `recall` is the proportion of true dupes pairs that the learned
-        # rules must cover. You may want to reduce this if your are making
-        # too many blocks and too many comparisons.
-        deduper.train(recall=0.90)
-
-        with open(settings_file, 'wb') as sf:
-            deduper.write_settings(sf)
-
-        # We can now remove some of the memory hobbing objects we used
-        # for training
-        deduper.cleanup_training()
-
-    # ## Blocking
-
-    print('blocking...')
-
-    # To run blocking on such a large set of data, we create a separate table
-    # that contains blocking keys and record ids
-    print('creating blocking_map database')
-    with write_con.cursor() as cur:
-        cur.execute("DROP TABLE IF EXISTS blocking_map")
-        cur.execute("CREATE TABLE blocking_map "
-                    "(block_key VARCHAR(200), donor_id INTEGER) "
-                    "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
-
-    write_con.commit()
-
-    # If dedupe learned a Index Predicate, we have to take a pass
-    # through the data and create indices.
-    print('creating inverted index')
-
-    for field in deduper.fingerprinter.index_fields:
-        with read_con.cursor() as cur:
-            cur.execute("SELECT DISTINCT {field} FROM processed_donors "
-                        "WHERE {field} IS NOT NULL".format(field=field))
-            field_data = (row[0] for row in cur)
-            deduper.fingerprinter.index(field_data, field)
-
-    # Now we are ready to write our blocking map table by creating a
-    # generator that yields unique `(block_key, donor_id)` tuples.
-    print('writing blocking map')
-
-    with read_con.cursor() as read_cur:
-        read_cur.execute(DONOR_SELECT)
-        full_data = ((row['donor_id'], row) for row in read_cur)
-        b_data = deduper.fingerprinter(full_data)
-
-        with write_con.cursor() as write_cur:
-
-            write_cur.executemany("INSERT INTO blocking_map VALUES (%s, %s)",
-                                  b_data)
-
-    write_con.commit()
-
-    # Free up memory by removing indices we don't need anymore
-    deduper.fingerprinter.reset_indices()
-
-    # indexing blocking_map
-    print('creating index')
-    with write_con.cursor() as cur:
-        cur.execute("CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)")
-
-    write_con.commit()
-    read_con.commit()
-
-    # select unique pairs to compare
-    with read_con.cursor(MySQLdb.cursors.SSCursor) as read_cur:
-
-        read_cur.execute("""
-               select a.donor_id,
-                      json_object('city', a.city,
-                                  'name', a.name,
-                                  'zip', a.zip,
-                                  'state', a.state,
-                                  'address', a.address),
-                      b.donor_id,
-                      json_object('city', b.city,
-                                  'name', b.name,
-                                  'zip', b.zip,
-                                  'state', b.state,
-                                  'address', b.address)
-               from (select DISTINCT l.donor_id as east, r.donor_id as west
-                     from blocking_map as l
-                     INNER JOIN blocking_map as r
-                     using (block_key)
-                     where l.donor_id < r.donor_id) ids
-               INNER JOIN processed_donors a on ids.east=a.donor_id
-               INNER JOIN processed_donors b on ids.west=b.donor_id
-               """)
-
-        # ## Clustering
-
-        print('clustering...')
-        clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
-                                          threshold=0.5)
-
-        with write_con.cursor() as write_cur:
-
-            # ## Writing out results
-
-            # We now have a sequence of tuples of donor ids that dedupe believes
-            # all refer to the same entity. We write this out onto an entity map
-            # table
-            write_cur.execute("DROP TABLE IF EXISTS entity_map")
-
-            print('creating entity_map database')
-            write_cur.execute("CREATE TABLE entity_map "
-                              "(donor_id INTEGER, canon_id INTEGER, "
-                              " cluster_score FLOAT, PRIMARY KEY(donor_id))")
-
-            write_cur.executemany('INSERT INTO entity_map VALUES (%s, %s, %s)',
-                                  cluster_ids(clustered_dupes))
-
-    write_con.commit()
-
-    with write_con.cursor() as cur:
-        cur.execute("CREATE INDEX head_index ON entity_map (canon_id)")
-
-    write_con.commit()
-    read_con.commit()
-
-    # Print out the number of duplicates found
-    print('# duplicate sets')
-
-    # ## Payoff
-
-    # With all this done, we can now begin to ask interesting questions
-    # of the data
-    #
-    # For example, let's see who the top 10 donors are.
-
-    locale.setlocale(locale.LC_ALL, '')  # for pretty printing numbers
-
-    with read_con.cursor() as cur:
-        # Create a temporary table so each group and unmatched record has
-        # a unique id
-        cur.execute("CREATE TEMPORARY TABLE e_map "
-                    "SELECT IFNULL(canon_id, donor_id) AS canon_id, donor_id "
-                    "FROM entity_map "
-                    "RIGHT JOIN donors USING(donor_id)")
-
-        cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) AS name, "
-                    "donation_totals.totals AS totals "
-                    "FROM donors INNER JOIN "
-                    "(SELECT canon_id, SUM(amount) AS totals "
-                    " FROM contributions INNER JOIN e_map "
-                    " USING (donor_id) "
-                    " GROUP BY (canon_id) "
-                    " ORDER BY totals "
-                    " DESC LIMIT 10) "
-                    "AS donation_totals "
-                    "WHERE donors.donor_id = donation_totals.canon_id")
-
-        print("Top Donors (deduped)")
-        for row in cur:
-            row['totals'] = locale.currency(row['totals'], grouping=True)
-            print('%(totals)20s: %(name)s' % row)
-
-        # Compare this to what we would have gotten if we hadn't done any
-        # deduplication
-        cur.execute("SELECT CONCAT_WS(' ', donors.first_name, donors.last_name) as name, "
-                    "SUM(contributions.amount) AS totals "
-                    "FROM donors INNER JOIN contributions "
-                    "USING (donor_id) "
-                    "GROUP BY (donor_id) "
-                    "ORDER BY totals DESC "
-                    "LIMIT 10")
-
-        print("Top Donors (raw)")
-        for row in cur:
-            row['totals'] = locale.currency(row['totals'], grouping=True)
-            print('%(totals)20s: %(name)s' % row)
-
-        # Close our database connection
-    read_con.close()
-    write_con.close()
-
-    print('ran in', time.time() - start_time, 'seconds')
diff --git a/athena_example/mysql_init_db.py b/athena_example/mysql_init_db.py
deleted file mode 100644
index fcdc1256..00000000
--- a/athena_example/mysql_init_db.py
+++ /dev/null
@@ -1,234 +0,0 @@
-#!/usr/bin/python
-"""
-This is a setup script for mysql_example.  It downloads a zip file of
-Illinois campaign contributions and loads them into a MySQL database
-named 'contributions'.
- 
-__Note:__ You will need to run this script first before execuing
-[mysql_example.py](mysql_example.html).
- 
-Tables created:
-* raw_table - raw import of entire CSV file
-* donors - all distinct donors based on name and address
-* recipients - all distinct campaign contribution recipients
-* contributions - contribution amounts tied to donor and recipients tables
-"""
-
-import os
-import zipfile
-import warnings
-
-from urllib.request import urlopen
-
-import MySQLdb
-
-warnings.filterwarnings('ignore', category=MySQLdb.Warning)
-
-contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'
-contributions_txt_file = 'Illinois-campaign-contributions.txt'
-
-if not os.path.exists(contributions_zip_file) :
-    print('downloading', contributions_zip_file, '(~60mb) ...')
-    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')
-    localFile = open(contributions_zip_file, 'wb')
-    localFile.write(u.read())
-    localFile.close()
-
-if not os.path.exists(contributions_txt_file) :
-    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')
-    print('extracting %s' % contributions_zip_file)
-    zip_file_contents = zip_file.namelist()
-    for f in zip_file_contents:
-        if ('.txt' in f):
-            zip_file.extract(f)
-    zip_file.close()
-
-conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf', 
-                       local_infile = 1,
-                       sql_mode="ALLOW_INVALID_DATES",
-                       db='contributions')
-c = conn.cursor()
-
-print('importing raw data from csv...')
-c.execute("DROP TABLE IF EXISTS raw_table")
-c.execute("DROP TABLE IF EXISTS donors")
-c.execute("DROP TABLE IF EXISTS recipients")
-c.execute("DROP TABLE IF EXISTS contributions")
-c.execute("DROP TABLE IF EXISTS processed_donors")
-
-c.execute("CREATE TABLE raw_table "
-          "(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), "
-          " address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), "
-          " state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), "
-          " date_recieved VARCHAR(10), loan_amount VARCHAR(12), "
-          " amount VARCHAR(23), receipt_type VARCHAR(23), "
-          " employer VARCHAR(70), occupation VARCHAR(40), "
-          " vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), "
-          " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
-          " vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
-          " vendor_zip VARCHAR(10), description VARCHAR(90), "
-          " election_type VARCHAR(10), election_year VARCHAR(10), "
-          " report_period_begin VARCHAR(10), report_period_end VARCHAR(33), "
-          " committee_name VARCHAR(70), committee_id VARCHAR(37)) "
-          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
-
-
-conn.commit()
-
-c.execute("LOAD DATA LOCAL INFILE %s INTO TABLE raw_table "
-          "FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\r\n' " 
-          "IGNORE 1 LINES "
-          "(reciept_id, last_name, first_name, "
-          " address_1, address_2, city, state, "
-          " zip, report_type, date_recieved, "
-          " loan_amount, amount, receipt_type, "
-          " employer, occupation, vendor_last_name, "
-          " vendor_first_name, vendor_address_1, "
-          " vendor_address_2, vendor_city, vendor_state, "
-          " vendor_zip, description, election_type, "
-          " election_year, "
-          " report_period_begin, report_period_end, "
-          " committee_name, committee_id, @dummy)",
-          (contributions_txt_file,))
-
-# Remove the very few records that mess up the demo 
-# (demo purposes only! Don't do something like this in production)
-c.execute("DELETE FROM raw_table WHERE LENGTH(date_recieved) < 10")
-
-# set empty, non-zero, strings in date columns to null
-c.execute("UPDATE raw_table SET report_period_begin = NULL WHERE LENGTH(report_period_begin) < 10")
-c.execute("UPDATE raw_table SET report_period_end = NULL WHERE LENGTH(report_period_end) < 10")
-
-#committee ID is requred. Remove the 2 rows that don't have it.
-c.execute("DELETE FROM raw_table WHERE committee_id=''");
-
-# There's a record with a date stuck in the committee_id column, which causes
-# problems when inserting into the contributions table below. Get rid of it this 
-# way.
-c.execute("DELETE FROM raw_table WHERE LENGTH( committee_id ) > 9")
-conn.commit()
-
-
-
-print('creating donors table...')
-c.execute("CREATE TABLE donors "
-          "(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, "
-          " last_name VARCHAR(70), first_name VARCHAR(35), "
-          " address_1 VARCHAR(35), address_2 VARCHAR(36), "
-          " city VARCHAR(20), state VARCHAR(15), "
-          " zip VARCHAR(11), employer VARCHAR(70), "
-          " occupation VARCHAR(40)) "
-          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
-c.execute("INSERT INTO donors "
-          "(first_name, last_name, address_1,"
-          " address_2, city, state, zip, employer, occupation) "
-          "SELECT DISTINCT "
-          "TRIM(first_name), TRIM(last_name), TRIM(address_1),  "
-          "TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), "
-          "TRIM(employer), TRIM(occupation) "
-          "FROM raw_table")
-conn.commit()
-
-
-print('creating indexes on donors table')
-c.execute("CREATE INDEX donors_donor_info ON donors "
-          "(last_name, first_name, address_1, address_2, city, "
-          " state, zip)")
-conn.commit()
-
-
-
-print('creating recipients table...')
-c.execute("CREATE TABLE recipients "
-          "(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) "
-          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
-
-c.execute("INSERT IGNORE INTO recipients "
-          "SELECT DISTINCT committee_id, committee_name FROM raw_table")
-conn.commit()
-
-print('creating contributions table')
-c.execute("CREATE TABLE contributions "
-          "(contribution_id INT, donor_id INT, recipient_id INT, "
-          " report_type VARCHAR(24), date_recieved DATE, "
-          " loan_amount VARCHAR(12), amount VARCHAR(23), "
-          " receipt_type VARCHAR(23), "
-          " vendor_last_name VARCHAR(70), "
-          " vendor_first_name VARCHAR(20), "
-          " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
-          " vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
-          " vendor_zip VARCHAR(10), description VARCHAR(90), "
-          " election_type VARCHAR(10), election_year VARCHAR(10), "
-          " report_period_begin DATE, report_period_end DATE) "
-          "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
-
-
-c.execute("INSERT INTO contributions "
-          "SELECT reciept_id, donors.donor_id, committee_id, "
-          " report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), "
-          " loan_amount, amount, "
-          " receipt_type, vendor_last_name , "
-          " vendor_first_name, vendor_address_1, vendor_address_2, "
-          " vendor_city, vendor_state, vendor_zip, description, "
-          " election_type, election_year, "
-          " STR_TO_DATE(report_period_begin, '%m/%d/%Y'), "
-          " STR_TO_DATE(report_period_end, '%m/%d/%Y') "
-          "FROM raw_table JOIN donors ON "
-          "donors.first_name = TRIM(raw_table.first_name) AND "
-          "donors.last_name = TRIM(raw_table.last_name) AND "
-          "donors.address_1 = TRIM(raw_table.address_1) AND "
-          "donors.address_2 = TRIM(raw_table.address_2) AND "
-          "donors.city = TRIM(raw_table.city) AND "
-          "donors.state = TRIM(raw_table.state) AND "
-          "donors.employer = TRIM(raw_table.employer) AND "
-          "donors.occupation = TRIM(raw_table.occupation) AND "
-          "donors.zip = TRIM(raw_table.zip)")
-conn.commit()
-
-print('creating indexes on contributions')
-c.execute("ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)")
-c.execute("CREATE INDEX donor_idx ON contributions (donor_id)")
-c.execute("CREATE INDEX recipient_idx ON contributions (recipient_id)")
-
-
-conn.commit()
-
-print('nullifying empty strings in donors')
-c.execute("UPDATE donors "
-          "SET "
-          "first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, "
-          "last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, "
-          "address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, "
-          "address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, "
-          "city = CASE city WHEN '' THEN NULL ELSE city END, "
-          "state = CASE state WHEN '' THEN NULL ELSE state END, "
-          "employer = CASE employer WHEN '' THEN NULL ELSE employer END, " 
-          "occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, " 
-          "zip = CASE zip WHEN '' THEN NULL ELSE zip END")
-
-
-conn.commit()
-
-c.execute("CREATE TABLE processed_donors AS " 
-          "(SELECT donor_id, " 
-          " LOWER(city) AS city, " 
-          " CASE WHEN (first_name IS NULL AND last_name IS NULL) "
-          "      THEN NULL "
-          "      ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) "
-          " END AS name, " 
-          " LOWER(zip) AS zip, " 
-          " LOWER(state) AS state, " 
-          " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) "
-          "      THEN NULL "
-          "      ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) "
-          " END AS address, " 
-          " LOWER(occupation) AS occupation, "
-          " LOWER(employer) AS employer, "
-          " ISNULL(first_name) AS person "
-          " FROM donors)")
- 
-c.execute("CREATE INDEX donor_idx ON processed_donors (donor_id)")
-
-c.close()
-conn.close()
-print('done')
diff --git a/athena_example/utils.py b/athena_example/utils.py
new file mode 100644
index 00000000..77f18fda
--- /dev/null
+++ b/athena_example/utils.py
@@ -0,0 +1,138 @@
+from __future__ import print_function
+import re
+import boto3
+import botocore
+import sys
+import datetime
+import os
+import time
+import pandas as pd
+from six import string_types
+import sys
+pyver = sys.version_info[0]
+
+if pyver<3:
+    from StringIO import StringIO as SomethingIO
+    from urlparse import urlparse
+else:
+    from io import BytesIO as SomethingIO
+    from urllib.parse import urlparse
+    
+sys.path.insert(0, '../athena_example/')
+import config
+
+s3 = boto3.client('s3', region_name=config.REGION, 
+                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
+  
+athena = boto3.client('athena', region_name=config.REGION, 
+                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
+
+def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs):
+    """Run `query` on Athena, wait for it to finish, and load '<output_location>/<execution id>.csv' via pandas_read_csv (extra kwargs are forwarded)."""
+    execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True)
+    return pandas_read_csv(os.path.join(output_location, execution_id + '.csv'), **kwargs)
+
+
+def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True):
+    """Submit `query` to Athena and return its QueryExecutionId.
+
+    With wait_until_finished (the default) the call polls until the query leaves
+    the QUEUED/RUNNING states and raises Exception unless it ended SUCCEEDED.
+    `region` is accepted for interface compatibility but is not used here: the
+    module-level `athena` client is already bound to config.REGION.
+    """
+    query_execution_id = athena.start_query_execution(
+        QueryString=query,
+        QueryExecutionContext={'Database': database},
+        WorkGroup=workgroup,
+        ResultConfiguration={'OutputLocation': output_location}
+    )['QueryExecutionId']
+    if not wait_until_finished:
+        return query_execution_id
+
+    seconds_to_wait = 1
+    while True:
+        # Linear back-off: sleep 1s, 2s, 3s, ... between status polls.
+        time.sleep(seconds_to_wait)
+        seconds_to_wait += 1
+        status = athena.get_query_execution(QueryExecutionId=query_execution_id)['QueryExecution']['Status']
+        if status['State'] not in ['QUEUED', 'RUNNING']:
+            break
+
+    if status['State'] != 'SUCCEEDED':
+        # StateChangeReason may be absent from the response; fall back to the final state.
+        reason = status.get('StateChangeReason', status['State'])
+        raise Exception("Athena query failed: %s" % (reason,), query_execution_id)
+    return query_execution_id
+
+# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
+# Import it instead, when it's updated.
+def is_s3_url(url):
+    """Return True if `url` parses with an s3, s3n, or s3a scheme; any error raised while parsing yields False."""
+    try:
+        return urlparse(url).scheme in ["s3", "s3n", "s3a"]
+    except Exception:
+        return False
+def seperate_bucket_key(url):
+    """Split 's3://bucket/key' into a (bucket, key) tuple; a non-matching `url` raises AttributeError."""
+    return re.match('s3://([^/]+)/(.*)', url).groups()
+
+def list_all(path):
+    """List every S3 key under an s3:// prefix (all result pages), or the entries of a local directory."""
+    if is_s3_url(path):
+        bucket, prefix = seperate_bucket_key(path)
+        pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
+        # A page with no matching objects has no 'Contents' entry at all.
+        return [obj['Key'] for page in pages for obj in page.get('Contents', [])]
+    return os.listdir(path)
+    
+
+def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs):
+    """Fetch the s3:// object named by `filepath_or_buffer` and parse its bytes with pd.read_csv(**kwargs); `verbose` is unused."""
+    src_bucket, src_key = seperate_bucket_key(filepath_or_buffer)
+    return pd.read_csv(SomethingIO(s3.get_object(Bucket=src_bucket, Key=src_key)['Body'].read()), **kwargs)
+
+def read(filename, verbose=True):
+    """Log the read, then return the body of an s3:// object or the text of a local file."""
+    log ("Reading {}".format(filename), verbose=verbose)
+    if not is_s3_url(filename):
+        with open (filename) as local_file:
+            return local_file.read()
+    src_bucket, src_key = seperate_bucket_key(filename)
+    return s3.get_object(Bucket=src_bucket, Key=src_key)['Body'].read()
+
+def write(body, filename):
+    """Upload `body` to the s3:// location `filename` with put_object; returns None."""
+    dest_bucket, dest_key = seperate_bucket_key(filename)
+    s3.put_object(Bucket=dest_bucket, Key=dest_key, Body=body)
+        
+    
+def file_exists(filename):
+    """Return True if the s3:// object `filename` exists; re-raise any error other than 'not found'."""
+    bucket, key = seperate_bucket_key(filename)
+    try:
+        # HEAD fetches metadata only; get_object would open a body stream that is never read.
+        s3.head_object(Bucket=bucket, Key=key)
+    except botocore.exceptions.ClientError as e:
+        # head_object reports a missing key as error code '404' rather than 'NoSuchKey'.
+        if e.response['Error']['Code'] in ('404', 'NoSuchKey'):
+            return False
+        raise
+    return True
+    
+    
+def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False):
+    """Append `outstr` as one line to `logfile_name` and echo it to stdout unless `quiet`;
+    does nothing when `verbose` equals False, and `timestamped` prepends '[<now>]' plus a tab."""
+    if verbose == False:
+        return
+    if timestamped:
+        line = "[%s]\t%s\n" % (str(datetime.datetime.now()), outstr)
+    else:
+        line = "%s\n" % (outstr,)
+    with open(logfile_name, "a") as sink:
+        sink.write(line)
+    if not quiet:
+        sys.stdout.write(line)
+        sys.stdout.flush()
+# Print iterations progress
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
index 01c42392..ab222233 100644
--- a/notebooks/athena_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -2,134 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collecting dedupe\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/5e/09/179feb316147279c76ea7e6dc5a5f9e00a6feadaeda131d535247e580619/dedupe-2.0.3-cp36-cp36m-manylinux1_x86_64.whl (89kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 92kB 239kB/s ta 0:00:011\n",
-      "\u001b[?25hCollecting pyathena\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/40/85/f37c049922f5d47e9126d7817ef7b8fb7abb2e6a9ea0dd06adcbffc0e8bc/PyAthena-1.10.8-py2.py3-none-any.whl (53kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 61kB 1.9MB/s ta 0:00:011\n",
-      "\u001b[?25hCollecting haversine>=0.4.1 (from dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/72/8e/6df8b563dd6b2961a36cd740b34c00b89142f1b97d92092c133379b2973f/haversine-2.2.0-py2.py3-none-any.whl\n",
-      "Collecting simplecosine>=1.2 (from dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/2d/22/6ea3a5ab8aea06d6563eb927e706f7342a00d1849c9be6143a2a7d84ddbd/simplecosine-1.2-py2.py3-none-any.whl\n",
-      "Collecting rlr>=2.4.3 (from dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/fa/02/3b1a9727a622ff4320919645ce35ceb887d90784d0bab41484756c33b7ea/rlr-2.4.5-py2.py3-none-any.whl\n",
-      "Collecting categorical-distance>=1.9 (from dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/1d/b7/4f97771f52c63916f4e4d349a644c2387961592e76070e7310463b2d70a5/categorical_distance-1.9-py3-none-any.whl\n",
-      "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.14.3)\n",
-      "Collecting fastcluster (from dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/1e/9d/3d7525a4722ee4a11ad969762d1de53b6dac326b5ac1366221e06958e1d7/fastcluster-1.1.26-cp36-cp36m-manylinux1_x86_64.whl (154kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 163kB 707kB/s ta 0:00:01\n",
-      "\u001b[?25hCollecting highered>=0.2.0 (from dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/81/00/cbd902cfd14ad1992fcdaa11a615d47b36b6136dc690e19b0afa58c7365d/highered-0.2.1-py2.py3-none-any.whl\n",
-      "Collecting dedupe-hcluster (from dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b2/1f/c6f6075c2e988b3a1759fabaf91d2f8f2de59c6e607a3fd9a2e06112a0de/dedupe_hcluster-0.3.8-cp36-cp36m-manylinux1_x86_64.whl (531kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 532kB 5.2MB/s ta 0:00:01\n",
-      "\u001b[?25hCollecting BTrees>=4.1.4 (from dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/48/b3/9ce3b32817db98e8bf20d6873e18ee3ee7feded135434d800b72bf8dfb9f/BTrees-4.7.2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n",
-      "\u001b[K    100% |████████████████████████████████| 3.0MB 8.2MB/s eta 0:00:01\n",
-      "\u001b[?25hCollecting Levenshtein-search (from dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/93/89/dc320196d10447540c95f58eab5dd316a2166310356c1d88b84724f4e793/Levenshtein_search-1.4.5-cp36-cp36m-manylinux1_x86_64.whl (59kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 61kB 21.2MB/s ta 0:00:01\n",
-      "\u001b[?25hCollecting zope.index (from dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ab/0f/f93bddfac1189bb6b973142da3ef2caa6817a59b07ca448095a30b644737/zope.index-5.0.0-cp36-cp36m-manylinux1_x86_64.whl (101kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 102kB 17.6MB/s a 0:00:01\n",
-      "\u001b[?25hCollecting typing-extensions (from dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/0c/0e/3f026d0645d699e7320b59952146d56ad7c374e9cd72cd16e7c74e657a0f/typing_extensions-3.7.4.2-py3-none-any.whl\n",
-      "Collecting affinegap>=1.3 (from dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b2/6a/91f5defe8178104449bc897208c9780b159575d16a959a5074f0bf39a6f0/affinegap-1.11-cp36-cp36m-manylinux1_x86_64.whl (45kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 51kB 12.0MB/s ta 0:00:01\n",
-      "\u001b[?25hCollecting doublemetaphone (from dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/c0/27/8df369334aac64755ca899b9a7cc4d2d60e800cca148322ef19309cdae0f/DoubleMetaphone-0.1-cp36-cp36m-manylinux1_x86_64.whl (78kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 81kB 3.4MB/s eta 0:00:01\n",
-      "\u001b[?25hCollecting dedupe-variable-datetime (from dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/65/8f/d21f6acadcdfd681ee038153883b5673b8b76f790e465d791780e6b7bf60/dedupe_variable_datetime-0.1.5-py3-none-any.whl\n",
-      "Collecting tenacity>=4.1.0 (from pyathena)\n",
-      "  Downloading https://files.pythonhosted.org/packages/b5/05/ff089032442058bd3386f9cd991cd88ccac81dca1494d78751621ee35e62/tenacity-6.2.0-py2.py3-none-any.whl\n",
-      "Requirement already satisfied: botocore>=1.5.52 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.15.39)\n",
-      "Collecting future (from pyathena)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 829kB 14.2MB/s ta 0:00:01\n",
-      "\u001b[?25hRequirement already satisfied: boto3>=1.4.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyathena) (1.12.39)\n",
-      "Collecting pylbfgs (from rlr>=2.4.3->dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b8/5b/b8e1ef62e5e5b034ce5ae919b64158ec8da4f64c995444aec7fd96e8ec42/PyLBFGS-0.2.0.13-cp36-cp36m-manylinux1_x86_64.whl (205kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 215kB 16.4MB/s ta 0:00:01\n",
-      "\u001b[?25hCollecting pyhacrf-datamade>=0.2.0 (from highered>=0.2.0->dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/84/f5/971e17a8b6686d5fc3d562e29e9c902743eb5f0f4436880b86cb11c0149c/pyhacrf_datamade-0.2.5-cp36-cp36m-manylinux1_x86_64.whl (788kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 798kB 14.5MB/s ta 0:00:01\n",
-      "\u001b[?25hCollecting zope.interface (from BTrees>=4.1.4->dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/fc/7e/8e1efcfa22b722a0d6e992172ab15a871988c290cb722fe8da6d11f1aeb2/zope.interface-5.1.0-cp36-cp36m-manylinux1_x86_64.whl (234kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 235kB 16.6MB/s ta 0:00:01\n",
-      "\u001b[?25hCollecting persistent>=4.1.0 (from BTrees>=4.1.4->dedupe)\n",
-      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/2e/4e/9bde9a2f63273f2e63a94a8198781aac559cc6efd2f560d69afcb0d9d8b5/persistent-4.6.4-cp36-cp36m-manylinux1_x86_64.whl (246kB)\n",
-      "\u001b[K    100% |████████████████████████████████| 256kB 17.5MB/s ta 0:00:01\n",
-      "\u001b[?25hRequirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.11.0)\n",
-      "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (39.1.0)\n",
-      "Collecting datetime-distance (from dedupe-variable-datetime->dedupe)\n",
-      "  Downloading https://files.pythonhosted.org/packages/6b/98/a5eff9256ff27e3bb8030466dabd772002e5014b9237cbeb18c542050ff5/datetime_distance-0.1.3-py3-none-any.whl\n",
-      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (2.7.3)\n",
-      "Requirement already satisfied: docutils<0.16,>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.14)\n",
-      "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (0.9.4)\n",
-      "Requirement already satisfied: urllib3<1.26,>=1.20; python_version != \"3.4\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore>=1.5.52->pyathena) (1.23)\n",
-      "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from boto3>=1.4.4->pyathena) (0.3.3)\n",
-      "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.11.5)\n",
-      "Requirement already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == \"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.18)\n",
-      "Building wheels for collected packages: future\n",
-      "  Running setup.py bdist_wheel for future ... \u001b[?25ldone\n",
-      "\u001b[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Successfully built future\n",
-      "Installing collected packages: haversine, simplecosine, future, pylbfgs, rlr, categorical-distance, fastcluster, pyhacrf-datamade, highered, dedupe-hcluster, zope.interface, persistent, BTrees, Levenshtein-search, zope.index, typing-extensions, affinegap, doublemetaphone, datetime-distance, dedupe-variable-datetime, dedupe, tenacity, pyathena\n",
-      "Successfully installed BTrees-4.7.2 Levenshtein-search-1.4.5 affinegap-1.11 categorical-distance-1.9 datetime-distance-0.1.3 dedupe-2.0.3 dedupe-hcluster-0.3.8 dedupe-variable-datetime-0.1.5 doublemetaphone-0.1 fastcluster-1.1.26 future-0.18.2 haversine-2.2.0 highered-0.2.1 persistent-4.6.4 pyathena-1.10.8 pyhacrf-datamade-0.2.5 pylbfgs-0.2.0.13 rlr-2.4.5 simplecosine-1.2 tenacity-6.2.0 typing-extensions-3.7.4.2 zope.index-5.0.0 zope.interface-5.1.0\n",
-      "\u001b[33mYou are using pip version 10.0.1, however version 20.2b1 is available.\n",
-      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "!pip install dedupe  pyathena"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "import sys\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "import config\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "AttributeError",
-     "evalue": "module 'logging' has no attribute 'logging'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-3-878b2de91830>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     86\u001b[0m \u001b[0;31m## Armin\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 87\u001b[0;31m     \u001b[0mlog_level\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEBUG\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     88\u001b[0m \u001b[0;31m#######\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     89\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mAttributeError\u001b[0m: module 'logging' has no attribute 'logging'"
-     ]
-    }
-   ],
    "source": [
     "# %load ../mysql_example/mysql_example.py\n",
     "#!/usr/bin/python\n",
@@ -151,6 +26,12 @@
     "[csv_example](csv_example.html)\n",
     "\"\"\"\n",
     "\n",
+    "# The results of this module differ slightly from those of the\n",
+    "# MySQL example. The difference comes from some special (and mostly\n",
+    "# erroneous) characters, such as \\a, which MySQL and Athena/pandas\n",
+    "# handle differently.\n",
+    "\n",
+    "import sys\n",
     "import os\n",
     "import itertools\n",
     "import time\n",
@@ -162,19 +43,17 @@
     "import csv\n",
     "import pandas as pd\n",
     "\n",
-    "# import MySQLdb\n",
-    "# import MySQLdb.cursors\n",
-    "\n",
     "import boto3\n",
-    "from pyathena import connect\n",
-    "from pyathena.pandas_cursor import PandasCursor\n",
     "import dedupe\n",
     "import dedupe.backport\n",
+    "sys.path.insert(0, '../athena_example/')\n",
+    "import config\n",
+    "sys.path.insert(0, '../athena_example/')\n",
+    "import utils\n",
     "\n",
-    "def dict_cursor_execute(cur, query):\n",
-    "    df = cur.execute(query).as_pandas()\n",
-    "    return df.where(pd.notnull(df), None).astype(str)\n",
-    "\n",
+    "def as_pandas(query, **kwrgs):\n",
+    "    df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n",
+    "    return df.where(pd.notnull(df), None)\n",
     "\n",
     "def record_pairs(result_set):\n",
     "    for i, row in enumerate(result_set):\n",
@@ -204,67 +83,28 @@
     "    # for convenience.  To enable verbose output, run `python\n",
     "    # examples/mysql_example/mysql_example.py -v`\n",
     "    \n",
-    "#     optp = optparse.OptionParser()\n",
-    "#     optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
-    "#                     help='Increase verbosity (specify multiple times for more)'\n",
-    "#                     )\n",
-    "#     (opts, args) = optp.parse_args()\n",
-    "#     log_level = logging.WARNING\n",
-    "#     if opts.verbose:\n",
-    "#         if opts.verbose == 1:\n",
-    "#             log_level = logging.INFO\n",
-    "#         elif opts.verbose >= 2:\n",
-    "#             log_level = logging.DEBUG\n",
-    "\n",
-    "## Armin\n",
-    "    log_level = logging.DEBUG\n",
-    "#######\n",
+    "    optp = optparse.OptionParser()\n",
+    "    optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
+    "                    help='Increase verbosity (specify multiple times for more)'\n",
+    "                    )\n",
+    "    (opts, args) = optp.parse_args()\n",
+    "    log_level = logging.WARNING\n",
+    "    if opts.verbose:\n",
+    "        if opts.verbose == 1:\n",
+    "            log_level = logging.INFO\n",
+    "        elif opts.verbose >= 2:\n",
+    "            log_level = logging.DEBUG\n",
+    "\n",
     "\n",
     "    logging.getLogger().setLevel(log_level)\n",
     "\n",
     "    \n",
     "\n",
-    "#     # ## Setup\n",
-    "#     MYSQL_CNF = os.path.abspath('.') + '/mysql.cnf'\n",
     "\n",
     "    settings_file = 'mysql_example_settings'\n",
     "    training_file = 'mysql_example_training.json'\n",
     "\n",
-    "    start_time = time.time()\n",
-    "\n",
-    "    # You'll need to copy `examples/mysql_example/mysql.cnf_LOCAL` to\n",
-    "    # `examples/mysql_example/mysql.cnf` and fill in your mysql database\n",
-    "    # information in `examples/mysql_example/mysql.cnf`\n",
-    "\n",
-    "    # We use Server Side cursors (SSDictCursor and SSCursor) to [avoid\n",
-    "    # having to have enormous result sets in\n",
-    "    # memory](http://stackoverflow.com/questions/1808150/how-to-efficiently-use-mysqldb-sscursor).\n",
-    "#     read_con = MySQLdb.connect(db='contributions',\n",
-    "#                                charset='utf8',\n",
-    "#                                read_default_file=MYSQL_CNF,\n",
-    "#                                cursorclass=MySQLdb.cursors.SSDictCursor)\n",
-    "\n",
-    "#     write_con = MySQLdb.connect(db='contributions',\n",
-    "#                                 charset='utf8',\n",
-    "#                                 read_default_file=MYSQL_CNF)\n",
-    "\n",
-    "    s3 = boto3.client('s3')  \n",
-    "    conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n",
-    "                   aws_secret_access_key=config.SECRET_ACCESS_KEY,\n",
-    "                   s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n",
-    "                   region_name=config.REGION, \n",
-    "                   work_group=config.WORKGROUP)    \n",
-    "    cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# !rm 'mysql_example_settings'\n",
-    "# !rm 'mysql_example_training.json'"
+    "    start_time = time.time()"
    ]
   },
   {
@@ -306,17 +146,13 @@
     "\n",
     "        # We will sample pairs from the entire donor table for training\n",
     "#         with read_con.cursor() as cur:\n",
-    "#         cur.execute(DONOR_SELECT)\n",
-    "#         temp_d = {i: row for i, row in enumerate(cur)}\n",
     "\n",
     "        # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! \n",
-    "        # But for that, astype doesn't work, and a loot on temp_d is slow, so for now let's just use str\n",
-    "        with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n",
-    "        #     Something like this is much faster, but let's  keep the changes minimal for now\n",
-    "        #     df = cur.execute(DONOR_SELECT).as_pandas().astype(str)\n",
-    "        #     temp_d = df.where(pd.notnull(df), None).to_dict('index')\n",
-    "            cursor_df = dict_cursor_execute(cursor, DONOR_SELECT)\n",
-    "            temp_d = cursor_df.to_dict('index')\n",
+    "        # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str\n",
+    "#         with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n",
+    "        temp_df = as_pandas(DONOR_SELECT)\n",
+    "        temp_d = temp_df.to_dict('index')\n",
+    "            \n",
     "\n",
     "        # If we have training data saved from a previous run of dedupe,\n",
     "        # look for it an load it in.\n",
@@ -374,14 +210,7 @@
     "    # To run blocking on such a large set of data, we create a separate table\n",
     "    # that contains blocking keys and record ids\n",
     "    print('creating blocking_map database')\n",
-    "#     with write_con.cursor() as cur:\n",
-    "#         cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n",
-    "#         cur.execute(\"CREATE TABLE blocking_map \"\n",
-    "#                     \"(block_key VARCHAR(200), donor_id INTEGER) \"\n",
-    "#                     \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
-    "\n",
-    "#     write_con.commit()\n",
-    "    cur.execute(\"DROP TABLE IF EXISTS blocking_map\")\n",
+    "    utils.athena_start_query(\"DROP TABLE IF EXISTS blocking_map\")\n",
     "\n",
     "    q='''\n",
     "    CREATE EXTERNAL TABLE blocking_map     \n",
@@ -396,7 +225,7 @@
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
     "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') \n",
-    "    cur.execute(q)"
+    "    utils.athena_start_query(q)"
    ]
   },
   {
@@ -409,12 +238,14 @@
     "    # through the data and create indices.\n",
     "    print('creating inverted index')\n",
     "\n",
+    "    # Armin: \n",
+    "    # This never runs, index_fields is empty, possible bug?\n",
     "    for field in deduper.fingerprinter.index_fields:\n",
     "        q = '''\n",
     "        SELECT DISTINCT {field} FROM processed_donors \n",
     "        WHERE {field} IS NOT NULL\n",
     "        '''.format(field=field)\n",
-    "        cur_df = dict_cursor_execute(cur, q)\n",
+    "        cur_df = as_pandas(q)\n",
     "        # Do I need to cast it as a list?\n",
     "        field_data = cur_df[field]\n",
     "        deduper.fingerprinter.index(field_data, field)\n",
@@ -432,17 +263,10 @@
     "    print('writing blocking map')\n",
     "    \n",
     "\n",
-    "    read_cur_dict = dict_cursor_execute(cur, DONOR_SELECT).to_dict('records')\n",
+    "    read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records')\n",
     "    full_data = ((row['donor_id'], row) for row in read_cur_dict)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -450,16 +274,7 @@
    "outputs": [],
    "source": [
     "    b_data = deduper.fingerprinter(full_data)\n",
-    "    buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n",
-    "#         csv_out.writerows(b_data)        \n",
-    "\n",
-    "#         \"\\n\".join(b_data)\n",
-    "#         with write_con.cursor() as write_cur:\n",
-    "\n",
-    "#             write_cur.executemany(\"INSERT INTO blocking_map VALUES (%s, %s)\",\n",
-    "#                                   b_data)\n",
-    "    s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer)    \n",
-    "#     write_con.commit()"
+    "    buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t'); utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer)  # serialize the blocking map as TSV, then upload it to S3\n"
    ]
   },
   {
@@ -468,13 +283,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "    # indexing blocking_map\n",
-    "#     print('creating index')\n",
-    "#     with write_con.cursor() as cur:\n",
-    "#         cur.execute(\"CREATE UNIQUE INDEX bm_idx ON blocking_map (block_key, donor_id)\")\n",
-    "\n",
-    "#     write_con.commit()\n",
-    "#     read_con.commit()\n",
     "\n",
     "    # select unique pairs to compare\n",
     "    q='''\n",
@@ -494,7 +302,7 @@
     "    INNER JOIN processed_donors a on ids.east=a.donor_id\n",
     "    INNER JOIN processed_donors b on ids.west=b.donor_id\n",
     "    '''\n",
-    "    read_cur_dict=dict_cursor_execute(cur, q).itertuples(index=False, name=False)"
+    "    read_cur_dict=as_pandas(q).itertuples(index=False, name=None)"
    ]
   },
   {
@@ -507,7 +315,7 @@
     "\n",
     "    print('clustering...')\n",
     "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n",
-    "                                          threshold=0.5)"
+    "                                      threshold=0.5)"
    ]
   },
   {
@@ -516,7 +324,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "    cur.execute(\"DROP TABLE IF EXISTS entity_map\")\n",
+    "    utils.athena_start_query(\"DROP TABLE IF EXISTS entity_map\")\n",
     "\n",
     "    print('creating entity_map database')\n",
     "    q='''\n",
@@ -532,12 +340,11 @@
     "        'classification'='csv', \n",
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
-    "    '''.format(bucket, root_key+'entity_map') \n",
-    "    cur.execute(q) \n",
+    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') \n",
+    "    utils.athena_start_query(q) \n",
     "\n",
     "    buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n",
-    "    s3.put_object(Bucket=bucket, Key=root_key+'entity_map/entity_map.csv', Body=buffer)    \n",
-    "\n"
+    "    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer)    \n"
    ]
   },
   {
@@ -556,10 +363,9 @@
     "    #\n",
     "    # For example, let's see who the top 10 donors are.\n",
     "\n",
-    "    locale.setlocale(locale.LC_ALL, '')  # for pretty printing numbers\n",
+    "    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers\n",
     "    \n",
-    "    cur.execute(\"DROP TABLE IF EXISTS e_map\")\n",
-    "\n",
+    "    utils.athena_start_query(\"DROP TABLE IF EXISTS e_map\")\n",
     "    q = '''\n",
     "    CREATE TABLE e_map as \n",
     "        SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id \n",
@@ -567,7 +373,7 @@
     "            RIGHT JOIN donors USING(donor_id)\n",
     "    '''\n",
     "    \n",
-    "    cur.execute(q)\n",
+    "    utils.athena_start_query(q)\n",
     "    q ='''\n",
     "    SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
     "        donation_totals.totals AS totals \n",
@@ -580,38 +386,38 @@
     "        DESC LIMIT 10) \n",
     "        AS donation_totals \n",
     "    ON donors.donor_id = donation_totals.canon_id\n",
+    "    ORDER BY totals DESC\n",
     "    '''\n",
-    "    cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n",
+    "    cur_dict = as_pandas(q).to_dict('records')\n",
     "\n",
     "    print(\"Top Donors (deduped)\")\n",
     "    for row in cur_dict:\n",
     "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
-    "        print('%(totals)20s: %(name)s' % row)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "        print('%(totals)20s: %(name)s' % row)\n",
+    "\n",
     "    # Compare this to what we would have gotten if we hadn't done any\n",
     "    # deduplication\n",
     "\n",
     "    q = '''\n",
-    "    SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
-    "        SUM(cast(contributions.amount as double)) AS totals \n",
-    "    FROM donors INNER JOIN contributions \n",
-    "        USING (donor_id) \n",
-    "    GROUP BY donor_id), name\n",
+    "    with donorscontributions as(\n",
+    "\n",
+    "        SELECT donors.donor_id, \n",
+    "            array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
+    "            cast(contributions.amount as double) as amount\n",
+    "        FROM donors INNER JOIN contributions \n",
+    "            USING (donor_id) \n",
+    "    )\n",
+    "    SELECT name, sum(amount) AS totals  \n",
+    "    FROM donorscontributions\n",
+    "    GROUP BY donor_id, name\n",
     "    ORDER BY totals DESC \n",
-    "    LIMIT 10\")\n",
+    "    LIMIT 10\n",
     "    '''\n",
     "\n",
-    "    cur_dict = dict_cursor_execute(cur, q).to_dict('records')\n",
+    "    cur_dict = as_pandas(q).to_dict('records')\n",
     "\n",
     "    print(\"Top Donors (raw)\")\n",
-    "    for row in cur:\n",
+    "    for row in cur_dict:\n",
     "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
     "        print('%(totals)20s: %(name)s' % row)\n",
     "\n",
@@ -622,73 +428,22 @@
     "    print('ran in', time.time() - start_time, 'seconds')"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# I'm here\n",
-    "Found a way to map block_key to block_numbers\n",
-    "** CREATE TABLE, according to some thing online, has more timeout!\n",
-    "** Looks like i should be using (bucketing)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Problem:\n",
-    "The athena mapping doesn't have many distinct values, a huge number for example have 6061:None:2, while there is only one like this in sql!?\n",
-    "The problem, probably was probably address, the concat was buggy and there were too many nulls.\n",
-    "Still while raw table matches, donors don't! The athena is too much bigger\n",
-    "Start from here: Run this query on both, the results are different"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "create table as_blocking_map_number\n",
-    "with (bucketed_by = block_number)\n",
-    "as( \n",
-    "    SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n",
-    "    from blocking_map)\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[NbConvertApp] Converting notebook athena_example.ipynb to script\n",
+      "[NbConvertApp] Writing 11731 bytes to ../athena_example/athena_example.py\n"
+     ]
+    }
+   ],
    "source": [
-    "%%time\n",
-    "import sys\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "from pyathena import connect\n",
-    "from pyathena.pandas_cursor import PandasCursor\n",
-    "\n",
-    "import config\n",
-    "\n",
-    "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n",
-    "               aws_secret_access_key=config.SECRET_ACCESS_KEY,\n",
-    "               s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n",
-    "               region_name=config.REGION, \n",
-    "               work_group=config.WORKGROUP)    \n",
-    "cur = conn.cursor(PandasCursor, schema_name=config.SCHEMA_NAME)\n",
-    "q='''\n",
-    "with blocking_map_number as( \n",
-    "    SELECT donor_id, dense_rank() over (ORDER BY block_key) as block_number\n",
-    "    from blocking_map)\n",
-    "create table donor_id_pairs as (\n",
-    "    SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
-    "    from blocking_map_number as l\n",
-    "    INNER JOIN blocking_map_number as r\n",
-    "    using (block_number)\n",
-    "    where l.donor_id < r.donor_id)\n",
-    "'''\n",
-    "cur.execute(q)"
+    "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/"
    ]
   },
   {
@@ -716,6 +471,13 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.6.10"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
   }
  },
  "nbformat": 4,
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
index 5e8a5a32..47e75969 100644
--- a/notebooks/athena_init_db.ipynb
+++ b/notebooks/athena_init_db.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -15,13 +15,14 @@
    ],
    "source": [
     "%%writefile ../athena_example/config.py\n",
+    "LOG_FILE = 'log.txt'\n",
     "# Connection parameters\n",
     "ACCESS_KEY_ID = None\n",
     "SECRET_ACCESS_KEY = None\n",
     "ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n",
     "WORKGROUP = 'RIA'\n",
     "REGION = 'eu-west-1'\n",
-    "SCHEMA_NAME = 'ria_data_science_s3'\n",
+    "DATABASE = 'ria_data_science_s3'\n",
     "\n",
     "# Database Parameters\n",
     "DATABASE_BUCKET = 'com.ria.scratch'\n",
@@ -30,19 +31,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Overwriting ../athena_example/athena_example.py\n"
+      "Writing ../athena_example/athena_init.py\n"
      ]
     }
    ],
    "source": [
-    "%%writefile ../athena_example/athena_example.py\n",
+    "%%writefile ../athena_example/athena_init.py\n",
     "#!/usr/bin/python\n",
     "\"\"\"\n",
     "This is a setup script for athena_example.  It downloads a zip file of\n",
@@ -66,9 +67,11 @@
     "import numpy as np\n",
     "from urllib.request import urlopen\n",
     "import boto3\n",
-    "from pyathena import connect\n",
     "import config\n",
     "import csv\n",
+    "import sys\n",
+    "sys.path.insert(0, '../athena_example/')\n",
+    "import utils\n",
     "\n",
     "\n",
     "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
@@ -92,15 +95,6 @@
     "\n",
     "\n",
     "\n",
-    "def as_pandas(query, **kwrgs):\n",
-    "    return utils.athena_to_panda(query, escapechar='\\\\', dtype='object', keep_default_na=False, na_values=[''], **kwrgs)\n",
-    "\n",
-    "conn = connect(aws_access_key_id=config.ACCESS_KEY_ID,\n",
-    "               aws_secret_access_key=config.SECRET_ACCESS_KEY,\n",
-    "               s3_staging_dir=config.ATHENA_GARBAGE_PATH,\n",
-    "               region_name=config.REGION, \n",
-    "               work_group=config.WORKGROUP)\n",
-    "c = conn.cursor(schema_name=config.SCHEMA_NAME)\n",
     "\n",
     "print('importing raw data from csv...')\n",
     "utils.athena_start_query(\"DROP TABLE IF EXISTS raw_table\")\n",
@@ -127,8 +121,8 @@
     "    committee_name VARCHAR(70), committee_id VARCHAR(37)) \n",
     "ROW FORMAT DELIMITED\n",
     "  FIELDS TERMINATED BY '\\t'\n",
-    "  LINES TERMINATED BY '\\n'  \n",
     "  ESCAPED BY '\\\\'\n",
+    "  LINES TERMINATED BY '\\n'  \n",
     "LOCATION\n",
     "    's3://{}/{}' \n",
     "TBLPROPERTIES (\n",
@@ -164,20 +158,28 @@
     "\n",
     "# Nullifying empty strings\n",
     "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
-    "df_lower=df.apply(lambda x: x.str.lower() if x.dtype=='object' else x, result_type='expand')\n",
+    "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
+    "\n",
     "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n",
     "           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))\n",
     "\n",
+    "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n",
+    "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n",
+    "# So we need to manually take care of that\n",
     "print('creating donors table...')\n",
     "q='''\n",
     "CREATE TABLE donors as\n",
     "    with tmp as\n",
     "      (SELECT DISTINCT \n",
-    "           TRIM(last_name) as last_name, TRIM(first_name) as first_name, \n",
-    "           TRIM(address_1) as address_1, TRIM(address_2) as address_2, \n",
-    "           TRIM(city) city, TRIM(state) as state, \n",
-    "           TRIM(zip) as zip, TRIM(employer) as employer, \n",
-    "           TRIM(occupation) as occupation\n",
+    "           NULLIF(TRIM(last_name), '') as last_name, \n",
+    "           NULLIF(TRIM(first_name), '') as first_name, \n",
+    "           NULLIF(TRIM(address_1), '') as address_1, \n",
+    "           NULLIF(TRIM(address_2), '') as address_2, \n",
+    "           NULLIF(TRIM(city), '') city, \n",
+    "           NULLIF(TRIM(state), '') as state, \n",
+    "           NULLIF(TRIM(zip), '') as zip, \n",
+    "           NULLIF(TRIM(employer), '') as employer, \n",
+    "           NULLIF(TRIM(occupation), '') as occupation\n",
     "      FROM raw_table)\n",
     "    SELECT row_number() over () as donor_id, * from tmp'''\n",
     "utils.athena_start_query(q)\n",
@@ -185,14 +187,33 @@
     "\n",
     "q='''\n",
     "CREATE TABLE recipients as\n",
-    "    SELECT DISTINCT committee_id, committee_name FROM raw_table\n",
+    "    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table\n",
     "'''\n",
     "utils.athena_start_query(q)\n",
     "\n",
     "print('creating contributions table')\n",
+    "\n",
+    "# --\n",
+    "# c.execute(\"CREATE TABLE contributions \"\n",
+    "#           \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n",
+    "#           \" report_type VARCHAR(24), date_recieved DATE, \"\n",
+    "#           \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n",
+    "#           \" receipt_type VARCHAR(23), \"\n",
+    "#           \" vendor_last_name VARCHAR(70), \"\n",
+    "#           \" vendor_first_name VARCHAR(20), \"\n",
+    "#           \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n",
+    "#           \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n",
+    "#           \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n",
+    "#           \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n",
+    "#           \" report_period_begin DATE, report_period_end DATE) \"\n",
+    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
+    "# --\n",
+    "\n",
     "q='''\n",
     "CREATE TABLE contributions as\n",
-    "    SELECT reciept_id, donors.donor_id, committee_id, \n",
+    "    SELECT reciept_id as contribution_id, \n",
+    "        donors.donor_id as donor_id , \n",
+    "        committee_id as recipient_id, \n",
     "        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n",
     "        loan_amount, amount, \n",
     "        receipt_type, vendor_last_name , \n",
@@ -202,15 +223,16 @@
     "        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n",
     "        date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n",
     "    FROM raw_table JOIN donors ON \n",
-    "        donors.first_name = TRIM(raw_table.first_name) AND \n",
-    "        donors.last_name = TRIM(raw_table.last_name) AND \n",
-    "        donors.address_1 = TRIM(raw_table.address_1) AND \n",
-    "        donors.address_2 = TRIM(raw_table.address_2) AND \n",
-    "        donors.city = TRIM(raw_table.city) AND \n",
-    "        donors.state = TRIM(raw_table.state) AND \n",
-    "        donors.employer = TRIM(raw_table.employer) AND \n",
-    "        donors.occupation = TRIM(raw_table.occupation) AND \n",
-    "        donors.zip = TRIM(raw_table.zip)'''\n",
+    "        coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND \n",
+    "        coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND \n",
+    "        coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND \n",
+    "        coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND \n",
+    "        coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND \n",
+    "        coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND \n",
+    "        coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND \n",
+    "        coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND \n",
+    "        coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')'''\n",
+    "\n",
     "utils.athena_start_query(q)\n",
     "\n",
     "q = '''\n",
@@ -225,7 +247,7 @@
     "     LOWER(state) AS state,  \n",
     "     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n",
     "          THEN NULL \n",
-    "          ELSE LOWER(array_join(filter(array[address_1, address_1], x-> x IS NOT NULL), ' '))\n",
+    "          ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n",
     "     END AS address,  \n",
     "     LOWER(occupation) AS occupation, \n",
     "     LOWER(employer) AS employer, \n",
@@ -241,7 +263,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -254,11 +276,6 @@
       "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
       "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
       "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
-      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
       "creating donors table...\n",
       "creating contributions table\n",
       "done\n"
@@ -266,7 +283,7 @@
     }
    ],
    "source": [
-    "!python ../athena_example/athena_example.py"
+    "!python ../athena_example/athena_init.py"
    ]
   },
   {

From da660eb8f8ecbc7cccbaac7b463119b175c7dec3 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-30-187.eu-west-1.compute.internal>
Date: Thu, 29 Oct 2020 15:17:02 +0000
Subject: [PATCH 08/19] checkpoint

---
 athena_example/athena_init.py  |  52 ++++++++--------
 athena_example/config.py       |   8 +--
 athena_example/utils.py        |   1 +
 notebooks/athena_example.ipynb | 107 +++++++++++++++++----------------
 notebooks/athena_init_db.ipynb |  92 +++++++++++-----------------
 5 files changed, 120 insertions(+), 140 deletions(-)

diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py
index 9ddb14c8..f8bac6e0 100644
--- a/athena_example/athena_init.py
+++ b/athena_example/athena_init.py
@@ -8,7 +8,7 @@
 [athena_example.py](athena_example.py).
  
 Tables created:
-* raw_table - raw import of entire CSV file
+* as_raw_table - raw import of entire CSV file
 * donors - all distinct donors based on name and address
 * recipients - all distinct campaign contribution recipients
 * contributions - contribution amounts tied to donor and recipients tables
@@ -51,15 +51,15 @@
 
 
 print('importing raw data from csv...')
-utils.athena_start_query("DROP TABLE IF EXISTS raw_table")
-utils.athena_start_query("DROP TABLE IF EXISTS donors")
-utils.athena_start_query("DROP TABLE IF EXISTS recipients")
-utils.athena_start_query("DROP TABLE IF EXISTS contributions")
-utils.athena_start_query("DROP TABLE IF EXISTS processed_donors")
+utils.athena_start_query("DROP TABLE IF EXISTS as_raw_table")
+utils.athena_start_query("DROP TABLE IF EXISTS as_donors")
+utils.athena_start_query("DROP TABLE IF EXISTS as_recipients")
+utils.athena_start_query("DROP TABLE IF EXISTS as_contributions")
+utils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors")
 
 
 q=r'''
-CREATE EXTERNAL TABLE raw_table 
+CREATE EXTERNAL TABLE as_raw_table 
     (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), 
     address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), 
     state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), 
@@ -83,7 +83,7 @@
     'classification'='csv', 
     'skip.header.line.count'='1',  
     'serialization.null.format'='')
-'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') 
+'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') 
 utils.athena_start_query(q)
 
 
@@ -115,14 +115,14 @@
 df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')
 
 utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None),
-           filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))
+           filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))
 
 # Athena is doesn't equate empty string and null, eventhough in the table spec we said so
 # Not that it's a bug, it works if the string is null in the source, but not after applying trim to it
 # So we need to manually take care of that
 print('creating donors table...')
 q='''
-CREATE TABLE donors as
+CREATE TABLE as_donors as
     with tmp as
       (SELECT DISTINCT 
            NULLIF(TRIM(last_name), '') as last_name, 
@@ -134,14 +134,14 @@
            NULLIF(TRIM(zip), '') as zip, 
            NULLIF(TRIM(employer), '') as employer, 
            NULLIF(TRIM(occupation), '') as occupation
-      FROM raw_table)
+      FROM as_raw_table)
     SELECT row_number() over () as donor_id, * from tmp'''
 utils.athena_start_query(q)
 
 
 q='''
-CREATE TABLE recipients as
-    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table
+CREATE TABLE as_recipients as
+    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table
 '''
 utils.athena_start_query(q)
 
@@ -164,7 +164,7 @@
 # --
 
 q='''
-CREATE TABLE contributions as
+CREATE TABLE as_contributions as
     SELECT reciept_id as contribution_id, 
         donors.donor_id as donor_id , 
         committee_id as recipient_id, 
@@ -176,21 +176,21 @@
         election_type, election_year, 
         date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, 
         date_parse(report_period_end, '%m/%d/%Y') as report_period_end 
-    FROM raw_table JOIN donors ON 
-        coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND 
-        coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND 
-        coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND 
-        coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND 
-        coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND 
-        coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND 
-        coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND 
-        coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND 
-        coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')'''
+    FROM as_raw_table JOIN as_donors donors ON 
+        coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND 
+        coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND 
+        coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND 
+        coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND 
+        coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND 
+        coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND 
+        coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND 
+        coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND 
+        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')'''
 
 utils.athena_start_query(q)
 
 q = '''
-CREATE TABLE processed_donors AS  
+CREATE TABLE as_processed_donors AS  
     SELECT donor_id,  
      LOWER(city) AS city,  
      CASE WHEN (first_name IS NULL AND last_name IS NULL) 
@@ -206,7 +206,7 @@
      LOWER(occupation) AS occupation, 
      LOWER(employer) AS employer, 
      first_name is null AS person 
- FROM donors'''
+ FROM as_donors'''
 utils.athena_start_query(q)
 
 
diff --git a/athena_example/config.py b/athena_example/config.py
index 60964c73..3715b750 100644
--- a/athena_example/config.py
+++ b/athena_example/config.py
@@ -2,11 +2,11 @@
 # Connection parameters
 ACCESS_KEY_ID = None
 SECRET_ACCESS_KEY = None
-ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'
-WORKGROUP = 'RIA'
+ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'
+WORKGROUP = 'RDS'
 REGION = 'eu-west-1'
-DATABASE = 'ria_data_science_s3'
+DATABASE = 'ria_tmp'
 
 # Database Parameters
-DATABASE_BUCKET = 'com.ria.scratch'
+DATABASE_BUCKET = 'ria-temp'
 DATABASE_ROOT_KEY = 'as-dedupe/'
diff --git a/athena_example/utils.py b/athena_example/utils.py
index 77f18fda..1b8b935a 100644
--- a/athena_example/utils.py
+++ b/athena_example/utils.py
@@ -73,6 +73,7 @@ def is_s3_url(url):
         return urlparse(url).scheme in ["s3", "s3n", "s3a"]
     except Exception:
         return False
+    
 def seperate_bucket_key(url):
     m = re.match('s3://([^/]+)/(.*)', url)
     return m.group(1), m.group(2)
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
index ab222233..da896697 100644
--- a/notebooks/athena_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install dedupe"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -75,7 +84,8 @@
     "            yield donor_id, cluster_id, score\n",
     "\n",
     "\n",
-    "if __name__ == '__main__':\n",
+    "# if __name__ == '__main__':\n",
+    "if True:\n",
     "\n",
     "    # ## Logging\n",
     "\n",
@@ -83,17 +93,17 @@
     "    # for convenience.  To enable verbose output, run `python\n",
     "    # examples/mysql_example/mysql_example.py -v`\n",
     "    \n",
-    "    optp = optparse.OptionParser()\n",
-    "    optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
-    "                    help='Increase verbosity (specify multiple times for more)'\n",
-    "                    )\n",
-    "    (opts, args) = optp.parse_args()\n",
+    "#     optp = optparse.OptionParser()\n",
+    "#     optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
+    "#                     help='Increase verbosity (specify multiple times for more)'\n",
+    "#                     )\n",
+    "#     (opts, args) = optp.parse_args()\n",
     "    log_level = logging.WARNING\n",
-    "    if opts.verbose:\n",
-    "        if opts.verbose == 1:\n",
-    "            log_level = logging.INFO\n",
-    "        elif opts.verbose >= 2:\n",
-    "            log_level = logging.DEBUG\n",
+    "#     if opts.verbose:\n",
+    "#         if opts.verbose == 1:\n",
+    "#             log_level = logging.INFO\n",
+    "#         elif opts.verbose >= 2:\n",
+    "#             log_level = logging.DEBUG\n",
     "\n",
     "\n",
     "    logging.getLogger().setLevel(log_level)\n",
@@ -119,7 +129,7 @@
     "    # We did a fair amount of preprocessing of the fields in\n",
     "    # `mysql_init_db.py`    \n",
     "    DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n",
-    "                   \"from processed_donors\"\n",
+    "                   \"from as_processed_donors\"\n",
     "\n",
     "    # ## Training\n",
     "\n",
@@ -209,11 +219,11 @@
     "\n",
     "    # To run blocking on such a large set of data, we create a separate table\n",
     "    # that contains blocking keys and record ids\n",
-    "    print('creating blocking_map database')\n",
-    "    utils.athena_start_query(\"DROP TABLE IF EXISTS blocking_map\")\n",
+    "    print('creating as_blocking_map database')\n",
+    "    utils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\")\n",
     "\n",
     "    q='''\n",
-    "    CREATE EXTERNAL TABLE blocking_map     \n",
+    "    CREATE EXTERNAL TABLE as_blocking_map     \n",
     "        (block_key VARCHAR(200), donor_id INTEGER)\n",
     "    ROW FORMAT DELIMITED\n",
     "      FIELDS TERMINATED BY '\\t'\n",
@@ -224,7 +234,7 @@
     "        'classification'='csv', \n",
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
-    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') \n",
+    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n",
     "    utils.athena_start_query(q)"
    ]
   },
@@ -242,7 +252,7 @@
     "    # This never runs, index_fields is empty, possible bug?\n",
     "    for field in deduper.fingerprinter.index_fields:\n",
     "        q = '''\n",
-    "        SELECT DISTINCT {field} FROM processed_donors \n",
+    "        SELECT DISTINCT {field} FROM as_processed_donors \n",
     "        WHERE {field} IS NOT NULL\n",
     "        '''.format(field=field)\n",
     "        cur_df = as_pandas(q)\n",
@@ -274,7 +284,8 @@
    "outputs": [],
    "source": [
     "    b_data = deduper.fingerprinter(full_data)\n",
-    "    buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer)    \n"
+    "    buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n",
+    "    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv', Body=buffer)    \n"
    ]
   },
   {
@@ -295,12 +306,12 @@
     "                  ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n",
     "              AS JSON))\n",
     "    FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
-    "         from blocking_map as l\n",
-    "         INNER JOIN blocking_map as r\n",
+    "         from as_blocking_map as l\n",
+    "         INNER JOIN as_blocking_map as r\n",
     "         using (block_key)\n",
     "         where l.donor_id < r.donor_id) ids\n",
-    "    INNER JOIN processed_donors a on ids.east=a.donor_id\n",
-    "    INNER JOIN processed_donors b on ids.west=b.donor_id\n",
+    "    INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
+    "    INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
     "    '''\n",
     "    read_cur_dict=as_pandas(q).itertuples(index=False, name=None)"
    ]
@@ -324,11 +335,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "    utils.athena_start_query(\"DROP TABLE IF EXISTS entity_map\")\n",
+    "    utils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\")\n",
     "\n",
-    "    print('creating entity_map database')\n",
+    "    print('creating as_entity_map database')\n",
     "    q='''\n",
-    "    CREATE EXTERNAL TABLE entity_map     \n",
+    "    CREATE EXTERNAL TABLE as_entity_map     \n",
     "        (donor_id INTEGER, canon_id INTEGER, \n",
     "         cluster_score FLOAT)\n",
     "    ROW FORMAT DELIMITED\n",
@@ -340,11 +351,11 @@
     "        'classification'='csv', \n",
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
-    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') \n",
+    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n",
     "    utils.athena_start_query(q) \n",
     "\n",
     "    buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n",
-    "    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer)    \n"
+    "    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_entity_map/as_entity_map.csv', Body=buffer)    \n"
    ]
   },
   {
@@ -365,27 +376,28 @@
     "\n",
     "    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers\n",
     "    \n",
-    "    utils.athena_start_query(\"DROP TABLE IF EXISTS e_map\")\n",
+    "    utils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\")\n",
     "    q = '''\n",
-    "    CREATE TABLE e_map as \n",
-    "        SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id \n",
-    "        FROM entity_map \n",
-    "            RIGHT JOIN donors USING(donor_id)\n",
+    "    CREATE TABLE as_e_map as \n",
+    "        SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n",
+    "        FROM as_entity_map \n",
+    "            RIGHT JOIN as_donors USING(donor_id)\n",
+    "        \n",
     "    '''\n",
     "    \n",
     "    utils.athena_start_query(q)\n",
     "    q ='''\n",
-    "    SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
+    "    SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
     "        donation_totals.totals AS totals \n",
-    "    FROM donors INNER JOIN \n",
+    "    FROM as_donors INNER JOIN \n",
     "        (SELECT canon_id, SUM(cast (amount as double)) AS totals \n",
-    "        FROM contributions INNER JOIN e_map \n",
+    "        FROM as_contributions INNER JOIN as_e_map \n",
     "        USING (donor_id) \n",
     "        GROUP BY (canon_id) \n",
     "        ORDER BY totals \n",
     "        DESC LIMIT 10) \n",
     "        AS donation_totals \n",
-    "    ON donors.donor_id = donation_totals.canon_id\n",
+    "    ON as_donors.donor_id = donation_totals.canon_id\n",
     "    ORDER BY totals DESC\n",
     "    '''\n",
     "    cur_dict = as_pandas(q).to_dict('records')\n",
@@ -401,10 +413,10 @@
     "    q = '''\n",
     "    with donorscontributions as(\n",
     "\n",
-    "        SELECT donors.donor_id, \n",
-    "            array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
-    "            cast(contributions.amount as double) as amount\n",
-    "        FROM donors INNER JOIN contributions \n",
+    "        SELECT as_donors.donor_id, \n",
+    "            array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
+    "            cast(as_contributions.amount as double) as amount\n",
+    "        FROM as_donors INNER JOIN as_contributions \n",
     "            USING (donor_id) \n",
     "    )\n",
     "    SELECT name, sum(amount) AS totals  \n",
@@ -430,20 +442,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[NbConvertApp] Converting notebook athena_example.ipynb to script\n",
-      "[NbConvertApp] Writing 11731 bytes to ../athena_example/athena_example.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/"
+    "# !jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/"
    ]
   },
   {
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
index 47e75969..19e6f600 100644
--- a/notebooks/athena_init_db.ipynb
+++ b/notebooks/athena_init_db.ipynb
@@ -2,30 +2,22 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ../athena_example/config.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile ../athena_example/config.py\n",
     "LOG_FILE = 'log.txt'\n",
     "# Connection parameters\n",
     "ACCESS_KEY_ID = None\n",
     "SECRET_ACCESS_KEY = None\n",
-    "ATHENA_GARBAGE_PATH = 's3://com.ria.scratch/athena_garbage/'\n",
-    "WORKGROUP = 'RIA'\n",
+    "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n",
+    "WORKGROUP = 'RDS'\n",
     "REGION = 'eu-west-1'\n",
-    "DATABASE = 'ria_data_science_s3'\n",
+    "DATABASE = 'ria_tmp'\n",
     "\n",
     "# Database Parameters\n",
-    "DATABASE_BUCKET = 'com.ria.scratch'\n",
+    "DATABASE_BUCKET = 'ria-temp'\n",
     "DATABASE_ROOT_KEY = 'as-dedupe/'"
    ]
   },
@@ -38,7 +30,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Writing ../athena_example/athena_init.py\n"
+      "Overwriting ../athena_example/athena_init.py\n"
      ]
     }
    ],
@@ -54,7 +46,7 @@
     "[athena_example.py](athena_example.py).\n",
     " \n",
     "Tables created:\n",
-    "* raw_table - raw import of entire CSV file\n",
+    "* as_raw_table - raw import of entire CSV file\n",
     "* donors - all distinct donors based on name and address\n",
     "* recipients - all distinct campaign contribution recipients\n",
     "* contributions - contribution amounts tied to donor and recipients tables\n",
@@ -97,15 +89,15 @@
     "\n",
     "\n",
     "print('importing raw data from csv...')\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS raw_table\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS donors\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS recipients\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS contributions\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS processed_donors\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_raw_table\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\")\n",
+    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\")\n",
     "\n",
     "\n",
     "q=r'''\n",
-    "CREATE EXTERNAL TABLE raw_table \n",
+    "CREATE EXTERNAL TABLE as_raw_table \n",
     "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
     "    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n",
     "    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n",
@@ -129,7 +121,7 @@
     "    'classification'='csv', \n",
     "    'skip.header.line.count'='1',  \n",
     "    'serialization.null.format'='')\n",
-    "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') \n",
+    "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n",
     "utils.athena_start_query(q)\n",
     "\n",
     "\n",
@@ -161,14 +153,14 @@
     "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
     "\n",
     "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n",
-    "           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', contributions_txt_file,))\n",
+    "           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))\n",
     "\n",
     "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n",
     "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n",
     "# So we need to manually take care of that\n",
     "print('creating donors table...')\n",
     "q='''\n",
-    "CREATE TABLE donors as\n",
+    "CREATE TABLE as_donors as\n",
     "    with tmp as\n",
     "      (SELECT DISTINCT \n",
     "           NULLIF(TRIM(last_name), '') as last_name, \n",
@@ -180,14 +172,14 @@
     "           NULLIF(TRIM(zip), '') as zip, \n",
     "           NULLIF(TRIM(employer), '') as employer, \n",
     "           NULLIF(TRIM(occupation), '') as occupation\n",
-    "      FROM raw_table)\n",
+    "      FROM as_raw_table)\n",
     "    SELECT row_number() over () as donor_id, * from tmp'''\n",
     "utils.athena_start_query(q)\n",
     "\n",
     "\n",
     "q='''\n",
-    "CREATE TABLE recipients as\n",
-    "    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table\n",
+    "CREATE TABLE as_recipients as\n",
+    "    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n",
     "'''\n",
     "utils.athena_start_query(q)\n",
     "\n",
@@ -210,7 +202,7 @@
     "# --\n",
     "\n",
     "q='''\n",
-    "CREATE TABLE contributions as\n",
+    "CREATE TABLE as_contributions as\n",
     "    SELECT reciept_id as contribution_id, \n",
     "        donors.donor_id as donor_id , \n",
     "        committee_id as recipient_id, \n",
@@ -222,21 +214,21 @@
     "        election_type, election_year, \n",
     "        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n",
     "        date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n",
-    "    FROM raw_table JOIN donors ON \n",
-    "        coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND \n",
-    "        coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND \n",
-    "        coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND \n",
-    "        coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND \n",
-    "        coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND \n",
-    "        coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND \n",
-    "        coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND \n",
-    "        coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND \n",
-    "        coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')'''\n",
+    "    FROM as_raw_table JOIN as_donors donors ON \n",
+    "        coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND \n",
+    "        coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n",
+    "        coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n",
+    "        coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n",
+    "        coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n",
+    "        coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n",
+    "        coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n",
+    "        coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n",
+    "        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')'''\n",
     "\n",
     "utils.athena_start_query(q)\n",
     "\n",
     "q = '''\n",
-    "CREATE TABLE processed_donors AS  \n",
+    "CREATE TABLE as_processed_donors AS  \n",
     "    SELECT donor_id,  \n",
     "     LOWER(city) AS city,  \n",
     "     CASE WHEN (first_name IS NULL AND last_name IS NULL) \n",
@@ -252,7 +244,7 @@
     "     LOWER(occupation) AS occupation, \n",
     "     LOWER(employer) AS employer, \n",
     "     first_name is null AS person \n",
-    " FROM donors'''\n",
+    " FROM as_donors'''\n",
     "utils.athena_start_query(q)\n",
     "\n",
     "\n",
@@ -265,23 +257,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "importing raw data from csv...\n",
-      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
-      "creating donors table...\n",
-      "creating contributions table\n",
-      "done\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!python ../athena_example/athena_init.py"
    ]

From d5f807a9e053c46d8564a9fe35cf3f2e25dda5da Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-18-123.eu-west-1.compute.internal>
Date: Fri, 30 Oct 2020 20:48:10 +0000
Subject: [PATCH 09/19] probably working

---
 athena_example/athena_init.py               |   44 +-
 athena_example/{utils.py => athenautils.py} |   70 +-
 athena_example/config.py                    |    3 +-
 notebooks/athena_example.ipynb              | 1025 ++++++++++++++++---
 notebooks/athena_init_db.ipynb              |   81 +-
 5 files changed, 1031 insertions(+), 192 deletions(-)
 rename athena_example/{utils.py => athenautils.py} (61%)

diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py
index f8bac6e0..c8b5b3ea 100644
--- a/athena_example/athena_init.py
+++ b/athena_example/athena_init.py
@@ -25,7 +25,7 @@
 import csv
 import sys
 sys.path.insert(0, '../athena_example/')
-import utils
+import athenautils
 
 
 contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'
@@ -51,14 +51,14 @@
 
 
 print('importing raw data from csv...')
-utils.athena_start_query("DROP TABLE IF EXISTS as_raw_table")
-utils.athena_start_query("DROP TABLE IF EXISTS as_donors")
-utils.athena_start_query("DROP TABLE IF EXISTS as_recipients")
-utils.athena_start_query("DROP TABLE IF EXISTS as_contributions")
-utils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors")
+athenautils.athena_start_query("DROP TABLE IF EXISTS as_raw_table", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS as_donors", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS as_recipients", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS as_contributions", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors", database=config.DATABASE)
 
 
-q=r'''
+q=r"""
 CREATE EXTERNAL TABLE as_raw_table 
     (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), 
     address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), 
@@ -83,8 +83,8 @@
     'classification'='csv', 
     'skip.header.line.count'='1',  
     'serialization.null.format'='')
-'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') 
-utils.athena_start_query(q)
+""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') 
+athenautils.athena_start_query(q, database=config.DATABASE)
 
 
 df = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE,  
@@ -114,14 +114,14 @@
 # df = df.replace(r'^\s*$', np.nan, regex=True)
 df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')
 
-utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None),
+athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None),
            filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))
 
 # Athena is doesn't equate empty string and null, eventhough in the table spec we said so
 # Not that it's a bug, it works if the string is null in the source, but not after applying trim to it
 # So we need to manually take care of that
 print('creating donors table...')
-q='''
+q="""
 CREATE TABLE as_donors as
     with tmp as
       (SELECT DISTINCT 
@@ -135,15 +135,15 @@
            NULLIF(TRIM(employer), '') as employer, 
            NULLIF(TRIM(occupation), '') as occupation
       FROM as_raw_table)
-    SELECT row_number() over () as donor_id, * from tmp'''
-utils.athena_start_query(q)
+    SELECT row_number() over () as donor_id, * from tmp"""
+athenautils.athena_start_query(q, database=config.DATABASE)
 
 
-q='''
+q="""
 CREATE TABLE as_recipients as
     SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table
-'''
-utils.athena_start_query(q)
+"""
+athenautils.athena_start_query(q, database=config.DATABASE)
 
 print('creating contributions table')
 
@@ -163,7 +163,7 @@
 #           "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
 # --
 
-q='''
+q="""
 CREATE TABLE as_contributions as
     SELECT reciept_id as contribution_id, 
         donors.donor_id as donor_id , 
@@ -185,11 +185,11 @@
         coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND 
         coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND 
         coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND 
-        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')'''
+        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')"""
 
-utils.athena_start_query(q)
+athenautils.athena_start_query(q, database=config.DATABASE)
 
-q = '''
+q = """
 CREATE TABLE as_processed_donors AS  
     SELECT donor_id,  
      LOWER(city) AS city,  
@@ -206,8 +206,8 @@
      LOWER(occupation) AS occupation, 
      LOWER(employer) AS employer, 
      first_name is null AS person 
- FROM as_donors'''
-utils.athena_start_query(q)
+ FROM as_donors"""
+athenautils.athena_start_query(q, database=config.DATABASE)
 
 
 
diff --git a/athena_example/utils.py b/athena_example/athenautils.py
similarity index 61%
rename from athena_example/utils.py
rename to athena_example/athenautils.py
index 1b8b935a..3cd8e4dd 100644
--- a/athena_example/utils.py
+++ b/athena_example/athenautils.py
@@ -27,13 +27,35 @@
 athena = boto3.client('athena', region_name=config.REGION, 
                       aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
 
-def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs):
-    query_execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True)
+def cursor_execute(query, database=None, cursortype='dict', buffersize=config.BUFFERSIZE,
+                   output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP,
+                   **kwargs):
+    """Yield the rows of an Athena query result one by one, as dicts ('dict') or plain tuples ('tuple'); nulls become None."""
+    if cursortype not in ('dict', 'tuple'):  # fail before the query is submitted, not with an unbound local afterwards
+        raise ValueError("cursortype must be 'dict' or 'tuple', got {!r}".format(cursortype))
+    kwargs['chunksize'] = buffersize  # forwarded to pd.read_csv, which then returns an iterator of DataFrames
+    df_cur = athena_to_panda(query, database=database, output_location=output_location,
+                             region=region, workgroup=workgroup, **kwargs)
+    for df in df_cur:
+        df = df.where(pd.notnull(df), None)
+        if cursortype == 'dict':
+            yield from df.to_dict('records')
+        else:
+            yield from df.itertuples(index=False, name=None)
+
+
+def athena_to_panda(query, database=None, 
+                    output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, 
+                    **kwargs):
+    query_execution_id = athena_start_query(query, database=database,
+                                            output_location=output_location, region=region, workgroup=workgroup,
+                                            wait_until_finished=True)
     df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs)
     return df
 
-
-def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True):
+def athena_start_query(query, database=None, 
+                       output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, 
+                       wait_until_finished=True):
     query_execution_id = athena.start_query_execution(
         QueryString=query,
         QueryExecutionContext={
@@ -88,13 +110,12 @@ def list_all(path):
     return listdir(path)
     
 
-def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs):
+def pandas_read_csv(filepath_or_buffer, **kwargs):
     bucket, key = seperate_bucket_key(filepath_or_buffer)
     obj = s3.get_object(Bucket=bucket, Key=key)
     return pd.read_csv(SomethingIO(obj['Body'].read()),  **kwargs)
 
-def read(filename, verbose=True):
-    log ("Reading {}".format(filename), verbose=verbose)
+def read(filename):
     if is_s3_url(filename):
         bucket, key = seperate_bucket_key(filename)
         obj=s3.get_object(Bucket=bucket, Key=key)
@@ -107,7 +128,24 @@ def write(body, filename):
     s3.put_object(Bucket=bucket, Key=key, Body=body)
     return
         
-    
+
+def file_name_append(filename, append, ommitext):
+    """Return `filename` with `append` inserted after its base name; the extension is dropped when `ommitext` is true."""
+    stem, suffix = os.path.splitext(filename)
+    kept_suffix = '' if ommitext else suffix
+    return '%s%s%s' % (stem, append, kept_suffix)
+
+def write_many(read_cursor, filename, buffersize=config.BUFFERSIZE):
+    """Write rows pulled from `read_cursor` as headerless tab-separated chunk files (`<base>_<n><ext>`), up to `buffersize` rows each."""
+    chunk_index = 0
+    while True:
+        batch = pd.DataFrame.from_records(read_cursor, nrows=buffersize)
+        if batch.empty:  # nothing left to read
+            return
+        target = file_name_append(filename, '_{}'.format(chunk_index), ommitext=False)
+        write(batch.to_csv(index=False, header=False, sep='\t'), target)
+        chunk_index += 1
+        
 def file_exists(filename):
     bucket, key = seperate_bucket_key(filename)
     try:
@@ -120,20 +158,4 @@ def file_exists(filename):
             raise
     else:
         return True
-    
-    
-def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False):
-    if verbose == False:
-        return
-    if timestamped:
-        outstr = "[%s]\t%s\n" % (str(datetime.datetime.now()) , outstr)
-    else:
-        outstr = "%s\n" % (outstr,)
-
-    with open(logfile_name, "a") as logfile:
-        logfile.write(outstr)
 
-    if not quiet:
-        sys.stdout.write(outstr);
-        sys.stdout.flush()
-# Print iterations progress
diff --git a/athena_example/config.py b/athena_example/config.py
index 3715b750..f8e4a24b 100644
--- a/athena_example/config.py
+++ b/athena_example/config.py
@@ -9,4 +9,5 @@
 
 # Database Parameters
 DATABASE_BUCKET = 'ria-temp'
-DATABASE_ROOT_KEY = 'as-dedupe/'
+DATABASE_ROOT_KEY = 'as_dedupe/'
+BUFFERSIZE = 100000
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
index da896697..e3a0e7b7 100644
--- a/notebooks/athena_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -1,23 +1,64 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Note: \n",
+    "Looks good, but check the sanity check notebook to make sure everything is correct"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: dedupe in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (2.0.6)\n",
+      "Requirement already satisfied: categorical-distance>=1.9 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.9)\n",
+      "Requirement already satisfied: dedupe-variable-datetime in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1.5)\n",
+      "Requirement already satisfied: affinegap>=1.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.11)\n",
+      "Requirement already satisfied: highered>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.2.1)\n",
+      "Requirement already satisfied: typing-extensions in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (3.7.4.3)\n",
+      "Requirement already satisfied: simplecosine>=1.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.2)\n",
+      "Requirement already satisfied: doublemetaphone in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1)\n",
+      "Requirement already satisfied: fastcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.1.26)\n",
+      "Requirement already satisfied: rlr>=2.4.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.4.5)\n",
+      "Requirement already satisfied: haversine>=0.4.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.3.0)\n",
+      "Requirement already satisfied: BTrees>=4.1.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (4.7.2)\n",
+      "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.18.1)\n",
+      "Requirement already satisfied: zope.index in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (5.0.0)\n",
+      "Requirement already satisfied: dedupe-hcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.3.8)\n",
+      "Requirement already satisfied: Levenshtein-search in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.4.5)\n",
+      "Requirement already satisfied: future in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.18.2)\n",
+      "Requirement already satisfied: datetime-distance in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.1.3)\n",
+      "Requirement already satisfied: pyhacrf-datamade>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from highered>=0.2.0->dedupe) (0.2.5)\n",
+      "Requirement already satisfied: pylbfgs in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from rlr>=2.4.3->dedupe) (0.2.0.13)\n",
+      "Requirement already satisfied: zope.interface in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (5.1.2)\n",
+      "Requirement already satisfied: persistent>=4.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (4.6.4)\n",
+      "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.14.0)\n",
+      "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (45.2.0.post20200210)\n",
+      "Requirement already satisfied: python-dateutil>=2.6.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from datetime-distance->dedupe-variable-datetime->dedupe) (2.8.1)\n",
+      "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.14.0)\n",
+      "Requirement already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == \"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.19)\n",
+      "\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.2.4 is available.\n",
+      "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
+     ]
+    }
+   ],
    "source": [
-    "# !pip install dedupe"
+    "!pip install dedupe"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# %load ../mysql_example/mysql_example.py\n",
-    "#!/usr/bin/python\n",
-    "\n",
     "\"\"\"\n",
     "This is an example of working with very large data. There are about\n",
     "700,000 unduplicated donors in this database of Illinois political\n",
@@ -25,7 +66,7 @@
     "\n",
     "With such a large set of input data, we cannot store all the comparisons\n",
     "we need to make in memory. Instead, we will read the pairs on demand\n",
-    "from the MySQL database.\n",
+    "from the Athena database.\n",
     "\n",
     "__Note:__ You will need to run `python mysql_init_db.py`\n",
     "before running this script. See the annotates source for\n",
@@ -58,10 +99,10 @@
     "sys.path.insert(0, '../athena_example/')\n",
     "import config\n",
     "sys.path.insert(0, '../athena_example/')\n",
-    "import utils\n",
+    "import athenautils\n",
     "\n",
     "def as_pandas(query, **kwrgs):\n",
-    "    df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n",
+    "    df = athenautils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n",
     "    return df.where(pd.notnull(df), None)\n",
     "\n",
     "def record_pairs(result_set):\n",
@@ -119,17 +160,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "reading from  mysql_example_settings\n"
+     ]
+    }
+   ],
    "source": [
     "    # We'll be using variations on this following select statement to pull\n",
     "    # in campaign donor info.\n",
     "    #\n",
     "    # We did a fair amount of preprocessing of the fields in\n",
     "    # `mysql_init_db.py`    \n",
-    "    DONOR_SELECT = \"SELECT donor_id, city, name, zip, state, address \" \\\n",
-    "                   \"from as_processed_donors\"\n",
+    "    DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n",
+    "                      from as_processed_donors\"\"\"\n",
     "\n",
     "    # ## Training\n",
     "\n",
@@ -155,13 +204,8 @@
     "        deduper = dedupe.Dedupe(fields, num_cores=4)\n",
     "\n",
     "        # We will sample pairs from the entire donor table for training\n",
-    "#         with read_con.cursor() as cur:\n",
-    "\n",
-    "        # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! \n",
-    "        # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str\n",
-    "#         with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:\n",
-    "        temp_df = as_pandas(DONOR_SELECT)\n",
-    "        temp_d = temp_df.to_dict('index')\n",
+    "        cur = athenautils.cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
+    "        temp_d = {i: row for i, row in enumerate(cur)}\n",
     "            \n",
     "\n",
     "        # If we have training data saved from a previous run of dedupe,\n",
@@ -209,9 +253,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "blocking...\n",
+      "creating as_blocking_map database\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'5651b314-d20b-4404-aa8d-30df70804e0e'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "    # ## Blocking\n",
     "\n",
@@ -220,9 +283,9 @@
     "    # To run blocking on such a large set of data, we create a separate table\n",
     "    # that contains blocking keys and record ids\n",
     "    print('creating as_blocking_map database')\n",
-    "    utils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\")\n",
+    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\", database=config.DATABASE)\n",
     "\n",
-    "    q='''\n",
+    "    q=\"\"\"\n",
     "    CREATE EXTERNAL TABLE as_blocking_map     \n",
     "        (block_key VARCHAR(200), donor_id INTEGER)\n",
     "    ROW FORMAT DELIMITED\n",
@@ -234,15 +297,23 @@
     "        'classification'='csv', \n",
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
-    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n",
-    "    utils.athena_start_query(q)"
+    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n",
+    "    athenautils.athena_start_query(q, database=config.DATABASE)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "creating inverted index\n"
+     ]
+    }
+   ],
    "source": [
     "    # If dedupe learned a Index Predicate, we have to take a pass\n",
     "    # through the data and create indices.\n",
@@ -251,94 +322,785 @@
     "    # Armin: \n",
     "    # This never runs, index_fields is empty, possible bug?\n",
     "    for field in deduper.fingerprinter.index_fields:\n",
-    "        q = '''\n",
-    "        SELECT DISTINCT {field} FROM as_processed_donors \n",
+    "        q = \"\"\"\n",
+    "        SELECT DISTINCT {field} FROM as_processed_donors\n",
     "        WHERE {field} IS NOT NULL\n",
-    "        '''.format(field=field)\n",
-    "        cur_df = as_pandas(q)\n",
-    "        # Do I need to cast it as a list?\n",
-    "        field_data = cur_df[field]\n",
+    "        \"\"\".format(field=field)\n",
+    "        cur = cur_execute(q)\n",
+    "        field_data = (row[field] for row in cur)\n",
     "        deduper.fingerprinter.index(field_data, field)\n",
     "     "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "writing blocking map\n"
+     ]
+    }
+   ],
    "source": [
     "    # Now we are ready to write our blocking map table by creating a\n",
     "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
     "    print('writing blocking map')\n",
     "    \n",
-    "\n",
-    "    read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records')\n",
-    "    full_data = ((row['donor_id'], row) for row in read_cur_dict)"
+    "    read_cur  = athenautils.cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
+    "    full_data = ((row['donor_id'], row) for row in read_cur)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_0.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_1.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_2.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_3.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_4.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_5.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_6.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_7.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_8.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_9.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_10.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_11.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_12.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_13.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_14.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_15.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_16.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_17.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_18.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_19.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_20.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_21.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_22.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_23.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_24.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_25.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_26.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_27.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_28.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_29.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_30.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_31.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_32.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_33.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_34.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_35.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_36.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_37.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_38.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_39.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_40.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_41.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_42.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_43.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_44.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_45.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_46.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_47.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_48.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_49.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_50.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_51.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_52.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_53.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_54.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_55.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_56.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_57.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_58.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_59.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_60.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_61.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_62.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_63.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_64.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_65.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_66.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_67.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_68.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_69.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_70.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_71.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_72.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_73.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_74.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_75.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_76.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_77.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_78.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_79.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_80.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_81.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_82.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_83.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_84.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_85.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_86.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_87.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_88.csv\n",
+      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_89.csv\n"
+     ]
+    }
+   ],
    "source": [
     "    b_data = deduper.fingerprinter(full_data)\n",
-    "    buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\\t')\n",
-    "    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv', Body=buffer)    \n"
+    "    athenautils.write_many(b_data, \n",
+    "                           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY, 'as_blocking_map/blocking.csv'))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
     "\n",
     "    # select unique pairs to compare\n",
-    "    q='''\n",
-    "    SELECT a.donor_id,\n",
-    "        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n",
-    "                              ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n",
-    "                    AS JSON)),\n",
-    "        b.donor_id,\n",
-    "        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n",
-    "                  ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n",
-    "              AS JSON))\n",
-    "    FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
-    "         from as_blocking_map as l\n",
-    "         INNER JOIN as_blocking_map as r\n",
-    "         using (block_key)\n",
-    "         where l.donor_id < r.donor_id) ids\n",
-    "    INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
-    "    INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
-    "    '''\n",
-    "    read_cur_dict=as_pandas(q).itertuples(index=False, name=None)"
+    "    q=\"\"\"\n",
+    "        SELECT a.donor_id,\n",
+    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n",
+    "                                  ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n",
+    "                        AS JSON)),\n",
+    "            b.donor_id,\n",
+    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n",
+    "                      ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n",
+    "                  AS JSON))\n",
+    "        FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
+    "             from as_blocking_map as l\n",
+    "             INNER JOIN as_blocking_map as r\n",
+    "             using (block_key)\n",
+    "             where l.donor_id < r.donor_id) ids\n",
+    "        INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
+    "        INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
+    "       \"\"\"\n",
+    "    read_cur = athenautils.cursor_execute(q, cursortype='tuple', database=config.DATABASE)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "clustering...\n",
+      "0\n",
+      "10000\n",
+      "20000\n",
+      "30000\n",
+      "40000\n",
+      "50000\n",
+      "60000\n",
+      "70000\n",
+      "80000\n",
+      "90000\n",
+      "100000\n",
+      "110000\n",
+      "120000\n",
+      "130000\n",
+      "140000\n",
+      "150000\n",
+      "160000\n",
+      "170000\n",
+      "180000\n",
+      "190000\n",
+      "200000\n",
+      "210000\n",
+      "220000\n",
+      "230000\n",
+      "240000\n",
+      "250000\n",
+      "260000\n",
+      "270000\n",
+      "280000\n",
+      "290000\n",
+      "300000\n",
+      "310000\n",
+      "320000\n",
+      "330000\n",
+      "340000\n",
+      "350000\n",
+      "360000\n",
+      "370000\n",
+      "380000\n",
+      "390000\n",
+      "400000\n",
+      "410000\n",
+      "420000\n",
+      "430000\n",
+      "440000\n",
+      "450000\n",
+      "460000\n",
+      "470000\n",
+      "480000\n",
+      "490000\n",
+      "500000\n",
+      "510000\n",
+      "520000\n",
+      "530000\n",
+      "540000\n",
+      "550000\n",
+      "560000\n",
+      "570000\n",
+      "580000\n",
+      "590000\n",
+      "600000\n",
+      "610000\n",
+      "620000\n",
+      "630000\n",
+      "640000\n",
+      "650000\n",
+      "660000\n",
+      "670000\n",
+      "680000\n",
+      "690000\n",
+      "700000\n",
+      "710000\n",
+      "720000\n",
+      "730000\n",
+      "740000\n",
+      "750000\n",
+      "760000\n",
+      "770000\n",
+      "780000\n",
+      "790000\n",
+      "800000\n",
+      "810000\n",
+      "820000\n",
+      "830000\n",
+      "840000\n",
+      "850000\n",
+      "860000\n",
+      "870000\n",
+      "880000\n",
+      "890000\n",
+      "900000\n",
+      "910000\n",
+      "920000\n",
+      "930000\n",
+      "940000\n",
+      "950000\n",
+      "960000\n",
+      "970000\n",
+      "980000\n",
+      "990000\n",
+      "1000000\n",
+      "1010000\n",
+      "1020000\n",
+      "1030000\n",
+      "1040000\n",
+      "1050000\n",
+      "1060000\n",
+      "1070000\n",
+      "1080000\n",
+      "1090000\n",
+      "1100000\n",
+      "1110000\n",
+      "1120000\n",
+      "1130000\n",
+      "1140000\n",
+      "1150000\n",
+      "1160000\n",
+      "1170000\n",
+      "1180000\n",
+      "1190000\n",
+      "1200000\n",
+      "1210000\n",
+      "1220000\n",
+      "1230000\n",
+      "1240000\n",
+      "1250000\n",
+      "1260000\n",
+      "1270000\n",
+      "1280000\n",
+      "1290000\n",
+      "1300000\n",
+      "1310000\n",
+      "1320000\n",
+      "1330000\n",
+      "1340000\n",
+      "1350000\n",
+      "1360000\n",
+      "1370000\n",
+      "1380000\n",
+      "1390000\n",
+      "1400000\n",
+      "1410000\n",
+      "1420000\n",
+      "1430000\n",
+      "1440000\n",
+      "1450000\n",
+      "1460000\n",
+      "1470000\n",
+      "1480000\n",
+      "1490000\n",
+      "1500000\n",
+      "1510000\n",
+      "1520000\n",
+      "1530000\n",
+      "1540000\n",
+      "1550000\n",
+      "1560000\n",
+      "1570000\n",
+      "1580000\n",
+      "1590000\n",
+      "1600000\n",
+      "1610000\n",
+      "1620000\n",
+      "1630000\n",
+      "1640000\n",
+      "1650000\n",
+      "1660000\n",
+      "1670000\n",
+      "1680000\n",
+      "1690000\n",
+      "1700000\n",
+      "1710000\n",
+      "1720000\n",
+      "1730000\n",
+      "1740000\n",
+      "1750000\n",
+      "1760000\n",
+      "1770000\n",
+      "1780000\n",
+      "1790000\n",
+      "1800000\n",
+      "1810000\n",
+      "1820000\n",
+      "1830000\n",
+      "1840000\n",
+      "1850000\n",
+      "1860000\n",
+      "1870000\n",
+      "1880000\n",
+      "1890000\n",
+      "1900000\n",
+      "1910000\n",
+      "1920000\n",
+      "1930000\n",
+      "1940000\n",
+      "1950000\n",
+      "1960000\n",
+      "1970000\n",
+      "1980000\n",
+      "1990000\n",
+      "2000000\n",
+      "2010000\n",
+      "2020000\n",
+      "2030000\n",
+      "2040000\n",
+      "2050000\n",
+      "2060000\n",
+      "2070000\n",
+      "2080000\n",
+      "2090000\n",
+      "2100000\n",
+      "2110000\n",
+      "2120000\n",
+      "2130000\n",
+      "2140000\n",
+      "2150000\n",
+      "2160000\n",
+      "2170000\n",
+      "2180000\n",
+      "2190000\n",
+      "2200000\n",
+      "2210000\n",
+      "2220000\n",
+      "2230000\n",
+      "2240000\n",
+      "2250000\n",
+      "2260000\n",
+      "2270000\n",
+      "2280000\n",
+      "2290000\n",
+      "2300000\n",
+      "2310000\n",
+      "2320000\n",
+      "2330000\n",
+      "2340000\n",
+      "2350000\n",
+      "2360000\n",
+      "2370000\n",
+      "2380000\n",
+      "2390000\n",
+      "2400000\n",
+      "2410000\n",
+      "2420000\n",
+      "2430000\n",
+      "2440000\n",
+      "2450000\n",
+      "2460000\n",
+      "2470000\n",
+      "2480000\n",
+      "2490000\n",
+      "2500000\n",
+      "2510000\n",
+      "2520000\n",
+      "2530000\n",
+      "2540000\n",
+      "2550000\n",
+      "2560000\n",
+      "2570000\n",
+      "2580000\n",
+      "2590000\n",
+      "2600000\n",
+      "2610000\n",
+      "2620000\n",
+      "2630000\n",
+      "2640000\n",
+      "2650000\n",
+      "2660000\n",
+      "2670000\n",
+      "2680000\n",
+      "2690000\n",
+      "2700000\n",
+      "2710000\n",
+      "2720000\n",
+      "2730000\n",
+      "2740000\n",
+      "2750000\n",
+      "2760000\n",
+      "2770000\n",
+      "2780000\n",
+      "2790000\n",
+      "2800000\n",
+      "2810000\n",
+      "2820000\n",
+      "2830000\n",
+      "2840000\n",
+      "2850000\n",
+      "2860000\n",
+      "2870000\n",
+      "2880000\n",
+      "2890000\n",
+      "2900000\n",
+      "2910000\n",
+      "2920000\n",
+      "2930000\n",
+      "2940000\n",
+      "2950000\n",
+      "2960000\n",
+      "2970000\n",
+      "2980000\n",
+      "2990000\n",
+      "3000000\n",
+      "3010000\n",
+      "3020000\n",
+      "3030000\n",
+      "3040000\n",
+      "3050000\n",
+      "3060000\n",
+      "3070000\n",
+      "3080000\n",
+      "3090000\n",
+      "3100000\n",
+      "3110000\n",
+      "3120000\n",
+      "3130000\n",
+      "3140000\n",
+      "3150000\n",
+      "3160000\n",
+      "3170000\n",
+      "3180000\n",
+      "3190000\n",
+      "3200000\n",
+      "3210000\n",
+      "3220000\n",
+      "3230000\n",
+      "3240000\n",
+      "3250000\n",
+      "3260000\n",
+      "3270000\n",
+      "3280000\n",
+      "3290000\n",
+      "3300000\n",
+      "3310000\n",
+      "3320000\n",
+      "3330000\n",
+      "3340000\n",
+      "3350000\n",
+      "3360000\n",
+      "3370000\n",
+      "3380000\n",
+      "3390000\n",
+      "3400000\n",
+      "3410000\n",
+      "3420000\n",
+      "3430000\n",
+      "3440000\n",
+      "3450000\n",
+      "3460000\n",
+      "3470000\n",
+      "3480000\n",
+      "3490000\n",
+      "3500000\n",
+      "3510000\n",
+      "3520000\n",
+      "3530000\n",
+      "3540000\n",
+      "3550000\n",
+      "3560000\n",
+      "3570000\n",
+      "3580000\n",
+      "3590000\n",
+      "3600000\n",
+      "3610000\n",
+      "3620000\n",
+      "3630000\n",
+      "3640000\n",
+      "3650000\n",
+      "3660000\n",
+      "3670000\n",
+      "3680000\n",
+      "3690000\n",
+      "3700000\n",
+      "3710000\n",
+      "3720000\n",
+      "3730000\n",
+      "3740000\n",
+      "3750000\n",
+      "3760000\n",
+      "3770000\n",
+      "3780000\n",
+      "3790000\n",
+      "3800000\n",
+      "3810000\n",
+      "3820000\n",
+      "3830000\n",
+      "3840000\n",
+      "3850000\n",
+      "3860000\n",
+      "3870000\n",
+      "3880000\n",
+      "3890000\n",
+      "3900000\n",
+      "3910000\n",
+      "3920000\n",
+      "3930000\n",
+      "3940000\n",
+      "3950000\n",
+      "3960000\n",
+      "3970000\n",
+      "3980000\n",
+      "3990000\n",
+      "4000000\n",
+      "4010000\n",
+      "4020000\n",
+      "4030000\n",
+      "4040000\n",
+      "4050000\n",
+      "4060000\n",
+      "4070000\n",
+      "4080000\n",
+      "4090000\n",
+      "4100000\n",
+      "4110000\n",
+      "4120000\n",
+      "4130000\n",
+      "4140000\n",
+      "4150000\n",
+      "4160000\n",
+      "4170000\n",
+      "4180000\n",
+      "4190000\n",
+      "4200000\n",
+      "4210000\n",
+      "4220000\n",
+      "4230000\n",
+      "4240000\n",
+      "4250000\n",
+      "4260000\n",
+      "4270000\n",
+      "4280000\n",
+      "4290000\n",
+      "4300000\n",
+      "4310000\n",
+      "4320000\n",
+      "4330000\n",
+      "4340000\n",
+      "4350000\n",
+      "4360000\n",
+      "4370000\n",
+      "4380000\n",
+      "4390000\n",
+      "4400000\n",
+      "4410000\n",
+      "4420000\n",
+      "4430000\n",
+      "4440000\n",
+      "4450000\n",
+      "4460000\n",
+      "4470000\n",
+      "4480000\n",
+      "4490000\n",
+      "4500000\n",
+      "4510000\n",
+      "4520000\n",
+      "4530000\n",
+      "4540000\n",
+      "4550000\n",
+      "4560000\n",
+      "4570000\n",
+      "4580000\n",
+      "4590000\n",
+      "4600000\n",
+      "4610000\n",
+      "4620000\n",
+      "4630000\n",
+      "4640000\n",
+      "4650000\n",
+      "4660000\n",
+      "4670000\n",
+      "4680000\n",
+      "4690000\n",
+      "4700000\n",
+      "4710000\n",
+      "4720000\n",
+      "4730000\n",
+      "4740000\n",
+      "4750000\n",
+      "4760000\n",
+      "4770000\n",
+      "4780000\n",
+      "4790000\n",
+      "4800000\n",
+      "4810000\n",
+      "4820000\n",
+      "4830000\n",
+      "4840000\n",
+      "4850000\n",
+      "4860000\n",
+      "4870000\n",
+      "4880000\n",
+      "4890000\n",
+      "4900000\n",
+      "4910000\n",
+      "4920000\n",
+      "4930000\n",
+      "4940000\n",
+      "4950000\n",
+      "4960000\n",
+      "4970000\n",
+      "4980000\n",
+      "4990000\n",
+      "5000000\n",
+      "5010000\n",
+      "5020000\n",
+      "5030000\n",
+      "5040000\n",
+      "5050000\n",
+      "5060000\n",
+      "5070000\n",
+      "5080000\n",
+      "5090000\n",
+      "5100000\n",
+      "5110000\n",
+      "5120000\n"
+     ]
+    }
+   ],
    "source": [
     "    # ## Clustering\n",
     "\n",
     "    print('clustering...')\n",
-    "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),\n",
+    "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n",
     "                                      threshold=0.5)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 29,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "creating as_entity_map database\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.048800940974382e-18\n",
+      "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.1038451096258602e-17\n",
+      "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.921472802362437e-17\n",
+      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.3509749469939632e-16\n",
+      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.753198598980096e-16\n",
+      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0403512617910142e-15\n",
+      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.894390795098561e-15\n",
+      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 8.13221083415663e-15\n",
+      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.2319167959587083e-14\n",
+      "WARNING:dedupe.clustering:A component contained 158375 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 6.068761194789993e-14\n",
+      "WARNING:dedupe.clustering:A component contained 158372 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.6520029686587212e-13\n",
+      "WARNING:dedupe.clustering:A component contained 158364 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.5125040046676256e-13\n",
+      "WARNING:dedupe.clustering:A component contained 158352 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.2269463311618112e-12\n",
+      "WARNING:dedupe.clustering:A component contained 158314 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.3356439660236965e-12\n",
+      "WARNING:dedupe.clustering:A component contained 157999 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.069763004960628e-12\n",
+      "WARNING:dedupe.clustering:A component contained 157528 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.4668471436592435e-11\n",
+      "WARNING:dedupe.clustering:A component contained 157002 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 6.705726454279488e-11\n",
+      "WARNING:dedupe.clustering:A component contained 156034 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.822899310600237e-10\n",
+      "WARNING:dedupe.clustering:A component contained 153167 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.955617200886392e-10\n",
+      "WARNING:dedupe.clustering:A component contained 150749 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.3472511446191333e-09\n",
+      "WARNING:dedupe.clustering:A component contained 148126 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.663451859737113e-09\n",
+      "WARNING:dedupe.clustering:A component contained 144445 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.961619074337575e-09\n",
+      "WARNING:dedupe.clustering:A component contained 140752 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.7079365851019727e-08\n",
+      "WARNING:dedupe.clustering:A component contained 136821 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 7.361173277021834e-08\n",
+      "WARNING:dedupe.clustering:A component contained 132985 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.00129481181664e-07\n",
+      "WARNING:dedupe.clustering:A component contained 129188 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 5.440113266301783e-07\n",
+      "WARNING:dedupe.clustering:A component contained 126461 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.4789049802767608e-06\n",
+      "WARNING:dedupe.clustering:A component contained 124279 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.020875427015852e-06\n",
+      "WARNING:dedupe.clustering:A component contained 121039 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0930919387732102e-05\n",
+      "WARNING:dedupe.clustering:A component contained 117376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.971327846301476e-05\n",
+      "WARNING:dedupe.clustering:A component contained 114455 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 8.076722851404745e-05\n",
+      "WARNING:dedupe.clustering:A component contained 109969 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.0002195200394847895\n",
+      "WARNING:dedupe.clustering:A component contained 106867 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.0005965000236037636\n",
+      "WARNING:dedupe.clustering:A component contained 101488 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.001619959237855584\n",
+      "WARNING:dedupe.clustering:A component contained 94945 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.004391296209574281\n",
+      "WARNING:dedupe.clustering:A component contained 89944 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.01184744490841891\n",
+      "WARNING:dedupe.clustering:A component contained 85759 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.031562819826922474\n",
+      "WARNING:dedupe.clustering:A component contained 79119 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.08138832068049236\n",
+      "WARNING:dedupe.clustering:A component contained 73185 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.1940994212134007\n",
+      "WARNING:dedupe.clustering:A component contained 67046 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.39565940016357204\n",
+      "WARNING:dedupe.clustering:A component contained 57601 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.6402437741869547\n",
+      "WARNING:dedupe.clustering:A component contained 36731 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.8286982262391892\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_0.csv\n",
+      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_1.csv\n",
+      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_2.csv\n",
+      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_3.csv\n",
+      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_4.csv\n"
+     ]
+    }
+   ],
    "source": [
-    "    utils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\")\n",
+    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n",
     "\n",
     "    print('creating as_entity_map database')\n",
-    "    q='''\n",
+    "    q=\"\"\"\n",
     "    CREATE EXTERNAL TABLE as_entity_map     \n",
     "        (donor_id INTEGER, canon_id INTEGER, \n",
     "         cluster_score FLOAT)\n",
@@ -351,18 +1113,49 @@
     "        'classification'='csv', \n",
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
-    "    '''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n",
-    "    utils.athena_start_query(q) \n",
+    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n",
+    "    athenautils.athena_start_query(q, database=config.DATABASE) \n",
     "\n",
-    "    buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\\t')\n",
-    "    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'as_entity_map/as_entity_map.csv', Body=buffer)    \n"
+    "    athenautils.write_many(cluster_ids(clustered_dupes),\n",
+    "                          filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY, 'as_entity_map/as_entity_map.csv'))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 34,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# duplicate sets\n",
+      "Top Donors (deduped)\n",
+      "      $32,146,134.06: democratic party of illinois\n",
+      "      $13,762,181.54: republican state senate campaign committee\n",
+      "       $9,590,682.54: republican governors association\n",
+      "       $9,040,913.46: madigan michael friends of\n",
+      "       $7,949,218.49: seiu healthcare il in pac\n",
+      "       $6,435,815.20: chicago teachers union, ift local 1\n",
+      "       $6,353,463.90: illinois senate democratic fund (the)\n",
+      "       $6,077,259.02: fred eychaner\n",
+      "       $6,022,884.47: scott cohen\n",
+      "       $5,911,667.89: illinois republican party\n",
+      "Top Donors (raw)\n",
+      "      $14,319,194.47: democratic party of illinois\n",
+      "      $13,020,132.76: democratic party of illinois\n",
+      "       $9,027,432.54: republican governors association\n",
+      "       $7,897,829.31: rga illinois 2010 pac\n",
+      "       $6,675,000.00: madigan michael friends of\n",
+      "       $6,008,841.69: scott cohen\n",
+      "       $5,570,839.00: ronald gidwitz,\n",
+      "       $5,562,800.00: citizens for emil jones\n",
+      "       $5,324,649.63: paul wood,\n",
+      "       $5,132,563.83: seiu healthcare il in\n",
+      "ran in 3723.114373922348 seconds\n"
+     ]
+    }
+   ],
    "source": [
     "    # Print out the number of duplicates found\n",
     "    print('# duplicate sets')\n",
@@ -376,60 +1169,58 @@
     "\n",
     "    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers\n",
     "    \n",
-    "    utils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\")\n",
-    "    q = '''\n",
-    "    CREATE TABLE as_e_map as \n",
+    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n",
+    "    \n",
+    "    q = \"\"\"\n",
+    "        CREATE TABLE as_e_map as \n",
     "        SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n",
     "        FROM as_entity_map \n",
-    "            RIGHT JOIN as_donors USING(donor_id)\n",
-    "        \n",
-    "    '''\n",
+    "        RIGHT JOIN as_donors USING(donor_id)        \n",
+    "        \"\"\"    \n",
+    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
     "    \n",
-    "    utils.athena_start_query(q)\n",
-    "    q ='''\n",
-    "    SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
-    "        donation_totals.totals AS totals \n",
-    "    FROM as_donors INNER JOIN \n",
-    "        (SELECT canon_id, SUM(cast (amount as double)) AS totals \n",
-    "        FROM as_contributions INNER JOIN as_e_map \n",
-    "        USING (donor_id) \n",
-    "        GROUP BY (canon_id) \n",
-    "        ORDER BY totals \n",
-    "        DESC LIMIT 10) \n",
-    "        AS donation_totals \n",
-    "    ON as_donors.donor_id = donation_totals.canon_id\n",
-    "    ORDER BY totals DESC\n",
-    "    '''\n",
-    "    cur_dict = as_pandas(q).to_dict('records')\n",
+    "    q = \"\"\"\n",
+    "        SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
+    "            donation_totals.totals AS totals \n",
+    "        FROM as_donors INNER JOIN \n",
+    "            (SELECT canon_id, SUM(cast (amount as double)) AS totals \n",
+    "            FROM as_contributions INNER JOIN as_e_map \n",
+    "            USING (donor_id) \n",
+    "            GROUP BY (canon_id) \n",
+    "            ORDER BY totals \n",
+    "            DESC LIMIT 10) \n",
+    "            AS donation_totals \n",
+    "        ON as_donors.donor_id = donation_totals.canon_id\n",
+    "        ORDER BY totals DESC\n",
+    "    \"\"\"\n",
+    "    cur = athenautils.cursor_execute(q, database=config.DATABASE)\n",
     "\n",
     "    print(\"Top Donors (deduped)\")\n",
-    "    for row in cur_dict:\n",
+    "    for row in cur:\n",
     "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
     "        print('%(totals)20s: %(name)s' % row)\n",
     "\n",
     "    # Compare this to what we would have gotten if we hadn't done any\n",
     "    # deduplication\n",
+    "    q = \"\"\"\n",
+    "        with donorscontributions as(\n",
     "\n",
-    "    q = '''\n",
-    "    with donorscontributions as(\n",
-    "\n",
-    "        SELECT as_donors.donor_id, \n",
-    "            array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
-    "            cast(as_contributions.amount as double) as amount\n",
-    "        FROM as_donors INNER JOIN as_contributions \n",
-    "            USING (donor_id) \n",
-    "    )\n",
-    "    SELECT name, sum(amount) AS totals  \n",
-    "    FROM donorscontributions\n",
-    "    GROUP BY donor_id, name\n",
-    "    ORDER BY totals DESC \n",
-    "    LIMIT 10\n",
-    "    '''\n",
-    "\n",
-    "    cur_dict = as_pandas(q).to_dict('records')\n",
+    "            SELECT as_donors.donor_id, \n",
+    "                array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
+    "                cast(as_contributions.amount as double) as amount\n",
+    "            FROM as_donors INNER JOIN as_contributions \n",
+    "                USING (donor_id) \n",
+    "            )\n",
+    "        SELECT name, sum(amount) AS totals  \n",
+    "        FROM donorscontributions\n",
+    "        GROUP BY donor_id, name\n",
+    "        ORDER BY totals DESC \n",
+    "        LIMIT 10\n",
+    "    \"\"\"\n",
+    "    cur = athenautils.cursor_execute(q, database=config.DATABASE)\n",
     "\n",
     "    print(\"Top Donors (raw)\")\n",
-    "    for row in cur_dict:\n",
+    "    for row in cur:\n",
     "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
     "        print('%(totals)20s: %(name)s' % row)\n",
     "\n",
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
index 19e6f600..a059f520 100644
--- a/notebooks/athena_init_db.ipynb
+++ b/notebooks/athena_init_db.ipynb
@@ -2,9 +2,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ../athena_example/config.py\n"
+     ]
+    }
+   ],
    "source": [
     "%%writefile ../athena_example/config.py\n",
     "LOG_FILE = 'log.txt'\n",
@@ -18,12 +26,13 @@
     "\n",
     "# Database Parameters\n",
     "DATABASE_BUCKET = 'ria-temp'\n",
-    "DATABASE_ROOT_KEY = 'as-dedupe/'"
+    "DATABASE_ROOT_KEY = 'as_dedupe/'\n",
+    "BUFFERSIZE = 100000"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -63,7 +72,7 @@
     "import csv\n",
     "import sys\n",
     "sys.path.insert(0, '../athena_example/')\n",
-    "import utils\n",
+    "import athenautils\n",
     "\n",
     "\n",
     "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
@@ -89,14 +98,14 @@
     "\n",
     "\n",
     "print('importing raw data from csv...')\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_raw_table\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\")\n",
-    "utils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\")\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_raw_table\", database=config.DATABASE)\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n",
     "\n",
     "\n",
-    "q=r'''\n",
+    "q=r\"\"\"\n",
     "CREATE EXTERNAL TABLE as_raw_table \n",
     "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
     "    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n",
@@ -121,8 +130,8 @@
     "    'classification'='csv', \n",
     "    'skip.header.line.count'='1',  \n",
     "    'serialization.null.format'='')\n",
-    "'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n",
-    "utils.athena_start_query(q)\n",
+    "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
     "\n",
     "\n",
     "df = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE,  \n",
@@ -152,14 +161,14 @@
     "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
     "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
     "\n",
-    "utils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n",
+    "athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n",
     "           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))\n",
     "\n",
     "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n",
     "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n",
     "# So we need to manually take care of that\n",
     "print('creating donors table...')\n",
-    "q='''\n",
+    "q=\"\"\"\n",
     "CREATE TABLE as_donors as\n",
     "    with tmp as\n",
     "      (SELECT DISTINCT \n",
@@ -173,15 +182,15 @@
     "           NULLIF(TRIM(employer), '') as employer, \n",
     "           NULLIF(TRIM(occupation), '') as occupation\n",
     "      FROM as_raw_table)\n",
-    "    SELECT row_number() over () as donor_id, * from tmp'''\n",
-    "utils.athena_start_query(q)\n",
+    "    SELECT row_number() over () as donor_id, * from tmp\"\"\"\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
     "\n",
     "\n",
-    "q='''\n",
+    "q=\"\"\"\n",
     "CREATE TABLE as_recipients as\n",
     "    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n",
-    "'''\n",
-    "utils.athena_start_query(q)\n",
+    "\"\"\"\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
     "\n",
     "print('creating contributions table')\n",
     "\n",
@@ -201,7 +210,7 @@
     "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
     "# --\n",
     "\n",
-    "q='''\n",
+    "q=\"\"\"\n",
     "CREATE TABLE as_contributions as\n",
     "    SELECT reciept_id as contribution_id, \n",
     "        donors.donor_id as donor_id , \n",
@@ -223,11 +232,11 @@
     "        coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n",
     "        coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n",
     "        coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n",
-    "        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')'''\n",
+    "        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n",
     "\n",
-    "utils.athena_start_query(q)\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
     "\n",
-    "q = '''\n",
+    "q = \"\"\"\n",
     "CREATE TABLE as_processed_donors AS  \n",
     "    SELECT donor_id,  \n",
     "     LOWER(city) AS city,  \n",
@@ -244,8 +253,8 @@
     "     LOWER(occupation) AS occupation, \n",
     "     LOWER(employer) AS employer, \n",
     "     first_name is null AS person \n",
-    " FROM as_donors'''\n",
-    "utils.athena_start_query(q)\n",
+    " FROM as_donors\"\"\"\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
     "\n",
     "\n",
     "\n",
@@ -255,9 +264,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "importing raw data from csv...\n",
+      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
+      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
+      "creating donors table...\n",
+      "creating contributions table\n",
+      "done\n"
+     ]
+    }
+   ],
    "source": [
     "!python ../athena_example/athena_init.py"
    ]

From fbfb323aaca448b68bca1fd16836c1177f8248b7 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-10-91.eu-west-1.compute.internal>
Date: Tue, 3 Nov 2020 21:02:51 +0000
Subject: [PATCH 10/19] checkpoint

---
 athena_example/README.md         |   2 +-
 athena_example/athena_example.py | 247 ++++----
 athena_example/athena_init.py    |  87 ++-
 athena_example/athenautils.py    |  28 +-
 athena_example/config.py         |   1 +
 notebooks/athena_example.ipynb   | 934 +++----------------------------
 notebooks/athena_init_db.ipynb   | 148 ++---
 7 files changed, 315 insertions(+), 1132 deletions(-)

diff --git a/athena_example/README.md b/athena_example/README.md
index 3530935d..53442a12 100644
--- a/athena_example/README.md
+++ b/athena_example/README.md
@@ -13,7 +13,7 @@ Once that's all done you can run the example:
 
 ```bash
 cd mysql_example
-python athena_init_db.py 
+python athena_init.py 
 python athena_example.py
 ```
 
diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py
index a738c56c..384172f2 100644
--- a/athena_example/athena_example.py
+++ b/athena_example/athena_example.py
@@ -4,9 +4,6 @@
 # In[ ]:
 
 
-# %load ../mysql_example/mysql_example.py
-#!/usr/bin/python
-
 """
 This is an example of working with very large data. There are about
 700,000 unduplicated donors in this database of Illinois political
@@ -14,7 +11,7 @@
 
 With such a large set of input data, we cannot store all the comparisons
 we need to make in memory. Instead, we will read the pairs on demand
-from the MySQL database.
+from the Athena database.
 
 __Note:__ You will need to run `python mysql_init_db.py`
 before running this script. See the annotates source for
@@ -47,12 +44,23 @@
 sys.path.insert(0, '../athena_example/')
 import config
 sys.path.insert(0, '../athena_example/')
-import utils
+import athenautils
 
-def as_pandas(query, **kwrgs):
-    df = utils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)
-    return df.where(pd.notnull(df), None)
+def cursor_execute(query, database):
+    '''
+    MySQL-Cursor-compatible wrapper: runs `query` on `database` via athenautils.cursor_execute with cursortype='tuple'.
+    '''
+    return athenautils.cursor_execute(query, database=database, 
+                                      cursortype='tuple', buffersize=config.BUFFERSIZE,
+                                      escapechar=None, keep_default_na=False, na_values=[''])
 
+def dict_cursor_execute(query, database):
+    '''
+    MySQL-DictCursor-compatible wrapper: runs `query` on `database` via athenautils.cursor_execute with cursortype='dict'.
+    '''
+    return athenautils.cursor_execute(query, database=database, 
+                                      cursortype='dict', buffersize=config.BUFFERSIZE,
+                                      escapechar=None, keep_default_na=False, na_values=[''])
 def record_pairs(result_set):
     for i, row in enumerate(result_set):
         a_record_id, a_record, b_record_id, b_record = row
@@ -75,7 +83,7 @@ def cluster_ids(clustered_dupes):
 
 if __name__ == '__main__':
 
-    # ## Logging
+    ## Logging
 
     # Dedupe uses Python logging to show or suppress verbose output. Added
     # for convenience.  To enable verbose output, run `python
@@ -113,7 +121,8 @@ def cluster_ids(clustered_dupes):
 #
 # We did a fair amount of preprocessing of the fields in
 # `mysql_init_db.py`    
-DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address "                "from processed_donors"
+DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address
+                  from as_processed_donors"""
 
 # ## Training
 
@@ -139,13 +148,8 @@ def cluster_ids(clustered_dupes):
     deduper = dedupe.Dedupe(fields, num_cores=4)
 
     # We will sample pairs from the entire donor table for training
-#         with read_con.cursor() as cur:
-
-    # Armin: The problem is the donor_id, it's numpy's int64, should be converted to int! 
-    # But for that, astype doesn't work, and a loop on temp_d is slow, so for now let's just use str
-#         with conn.cursor(PandasCursor, schema_name=schema_name) as cursor:
-    temp_df = as_pandas(DONOR_SELECT)
-    temp_d = temp_df.to_dict('index')
+    cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)  # dict cursor: each row maps column name -> value
+    temp_d = {i: row for i, row in enumerate(cur)}
         
 
     # If we have training data saved from a previous run of dedupe,
@@ -200,11 +204,13 @@ def cluster_ids(clustered_dupes):
 
 # To run blocking on such a large set of data, we create a separate table
 # that contains blocking keys and record ids
-print('creating blocking_map database')
-utils.athena_start_query("DROP TABLE IF EXISTS blocking_map")
+print('creating as_blocking_map database')
+athenautils.drop_external_table("as_blocking_map", 
+                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),
+                                database=config.DATABASE)
 
-q='''
-CREATE EXTERNAL TABLE blocking_map     
+q="""
+CREATE EXTERNAL TABLE as_blocking_map     
     (block_key VARCHAR(200), donor_id INTEGER)
 ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '\t'
@@ -215,8 +221,8 @@ def cluster_ids(clustered_dupes):
     'classification'='csv', 
     --'skip.header.line.count'='1',  
     'serialization.null.format'='')
-'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') 
-utils.athena_start_query(q)
+""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') 
+athenautils.athena_start_query(q, database=config.DATABASE)
 
 
 # In[ ]:
@@ -229,13 +235,12 @@ def cluster_ids(clustered_dupes):
 # Armin: 
 # This never runs, index_fields is empty, possible bug?
 for field in deduper.fingerprinter.index_fields:
-    q = '''
-    SELECT DISTINCT {field} FROM processed_donors 
+    q = """
+    SELECT DISTINCT {field} FROM as_processed_donors
     WHERE {field} IS NOT NULL
-    '''.format(field=field)
-    cur_df = as_pandas(q)
-    # Do I need to cast it as a list?
-    field_data = cur_df[field]
+    """.format(field=field)
+    cur = dict_cursor_execute(q, database=config.DATABASE)  # dict cursor, because rows are indexed by column name below
+    field_data = (row[field] for row in cur)
     deduper.fingerprinter.index(field_data, field)
  
 
@@ -247,16 +252,16 @@ def cluster_ids(clustered_dupes):
 # generator that yields unique `(block_key, donor_id)` tuples.
 print('writing blocking map')
 
-
-read_cur_dict = as_pandas(DONOR_SELECT).to_dict('records')
-full_data = ((row['donor_id'], row) for row in read_cur_dict)
+read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)
+full_data = ((row['donor_id'], row) for row in read_cur)
 
 
 # In[ ]:
 
 
 b_data = deduper.fingerprinter(full_data)
-buffer = pd.DataFrame.from_records(b_data).to_csv(index=False, header=False, sep='\t')    utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv', Body=buffer)    
+athenautils.write_many(b_data, 
+                       filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))
 
 
 # In[ ]:
@@ -264,24 +269,24 @@ def cluster_ids(clustered_dupes):
 
 
     # select unique pairs to compare
-    q='''
-    SELECT a.donor_id,
-        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],
-                              ARRAY[ a.city, a.name, a.zip, a.state, a.address])
-                    AS JSON)),
-        b.donor_id,
-        json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], 
-                  ARRAY[ b.city, b.name, b.zip, b.state, b.address])
-              AS JSON))
-    FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west
-         from blocking_map as l
-         INNER JOIN blocking_map as r
-         using (block_key)
-         where l.donor_id < r.donor_id) ids
-    INNER JOIN processed_donors a on ids.east=a.donor_id
-    INNER JOIN processed_donors b on ids.west=b.donor_id
-    '''
-    read_cur_dict=as_pandas(q).itertuples(index=False, name=None)
+    q="""
+        SELECT a.donor_id,
+            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],
+                                  ARRAY[ a.city, a.name, a.zip, a.state, a.address])
+                        AS JSON)),
+            b.donor_id,
+            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], 
+                      ARRAY[ b.city, b.name, b.zip, b.state, b.address])
+                  AS JSON))
+        FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west
+             from as_blocking_map as l
+             INNER JOIN as_blocking_map as r
+             using (block_key)
+             where l.donor_id < r.donor_id) ids
+        INNER JOIN as_processed_donors a on ids.east=a.donor_id
+        INNER JOIN as_processed_donors b on ids.west=b.donor_id
+       """
+    read_cur = cursor_execute(q, database=config.DATABASE)
 
 
 # In[ ]:
@@ -290,34 +295,37 @@ def cluster_ids(clustered_dupes):
 # ## Clustering
 
 print('clustering...')
-clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur_dict)),
+clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
                                   threshold=0.5)
 
 
 # In[ ]:
 
 
-utils.athena_start_query("DROP TABLE IF EXISTS entity_map")
-
-print('creating entity_map database')
-q='''
-CREATE EXTERNAL TABLE entity_map     
-    (donor_id INTEGER, canon_id INTEGER, 
-     cluster_score FLOAT)
-ROW FORMAT DELIMITED
-  FIELDS TERMINATED BY '\t'
-  LINES TERMINATED BY '\n'  
-LOCATION
-    's3://{}/{}' 
-TBLPROPERTIES (
-    'classification'='csv', 
-    --'skip.header.line.count'='1',  
-    'serialization.null.format'='')
-'''.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') 
-utils.athena_start_query(q) 
-
-buffer = pd.DataFrame.from_records(cluster_ids(clustered_dupes)).to_csv(index=False, header=False, sep='\t')
-utils.s3.put_object(Bucket=config.DATABASE_BUCKET, Key=config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv', Body=buffer)    
+#     athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE)
+    athenautils.drop_external_table("as_entity_map", 
+                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), 
+                                    database=config.DATABASE)
+    
+    print('creating as_entity_map database')
+    q="""
+    CREATE EXTERNAL TABLE as_entity_map     
+        (donor_id INTEGER, canon_id INTEGER, 
+         cluster_score FLOAT)
+    ROW FORMAT DELIMITED
+      FIELDS TERMINATED BY '\t'
+      LINES TERMINATED BY '\n'  
+    LOCATION
+        's3://{}/{}' 
+    TBLPROPERTIES (
+        'classification'='csv', 
+        --'skip.header.line.count'='1',  
+        'serialization.null.format'='')
+    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') 
+    athenautils.athena_start_query(q, database=config.DATABASE) 
+
+    athenautils.write_many(cluster_ids(clustered_dupes),
+                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))
 
 
 # In[ ]:
@@ -335,70 +343,65 @@ def cluster_ids(clustered_dupes):
 
 locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers
 
-utils.athena_start_query("DROP TABLE IF EXISTS e_map")
-q = '''
-CREATE TABLE e_map as 
-    SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id 
-    FROM entity_map 
-        RIGHT JOIN donors USING(donor_id)
-'''
-
-utils.athena_start_query(q)
-q ='''
-SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
-    donation_totals.totals AS totals 
-FROM donors INNER JOIN 
-    (SELECT canon_id, SUM(cast (amount as double)) AS totals 
-    FROM contributions INNER JOIN e_map 
-    USING (donor_id) 
-    GROUP BY (canon_id) 
-    ORDER BY totals 
-    DESC LIMIT 10) 
-    AS donation_totals 
-ON donors.donor_id = donation_totals.canon_id
-ORDER BY totals DESC
-'''
-cur_dict = as_pandas(q).to_dict('records')
+athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE)
+
+q = """
+    CREATE TABLE as_e_map as 
+    SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id 
+    FROM as_entity_map 
+    RIGHT JOIN as_donors USING(donor_id)        
+    """    
+athenautils.athena_start_query(q, database=config.DATABASE)
+
+q = """
+    SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
+        donation_totals.totals AS totals 
+    FROM as_donors INNER JOIN 
+        (SELECT canon_id, SUM(cast (amount as double)) AS totals 
+        FROM as_contributions INNER JOIN as_e_map 
+        USING (donor_id) 
+        GROUP BY (canon_id) 
+        ORDER BY totals 
+        DESC LIMIT 10) 
+        AS donation_totals 
+    ON as_donors.donor_id = donation_totals.canon_id
+    ORDER BY totals DESC
+"""
+cur = dict_cursor_execute(q, database=config.DATABASE)
 
 print("Top Donors (deduped)")
-for row in cur_dict:
+for row in cur:
     row['totals'] = locale.currency(row['totals'], grouping=True)
     print('%(totals)20s: %(name)s' % row)
 
 # Compare this to what we would have gotten if we hadn't done any
 # deduplication
-
-q = '''
-with donorscontributions as(
-
-    SELECT donors.donor_id, 
-        array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,
-        cast(contributions.amount as double) as amount
-    FROM donors INNER JOIN contributions 
-        USING (donor_id) 
-)
-SELECT name, sum(amount) AS totals  
-FROM donorscontributions
-GROUP BY donor_id, name
-ORDER BY totals DESC 
-LIMIT 10
-'''
-
-cur_dict = as_pandas(q).to_dict('records')
+q = """
+    with donorscontributions as(
+
+        SELECT as_donors.donor_id, 
+            array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,
+            cast(as_contributions.amount as double) as amount
+        FROM as_donors INNER JOIN as_contributions 
+            USING (donor_id) 
+        )
+    SELECT name, sum(amount) AS totals  
+    FROM donorscontributions
+    GROUP BY donor_id, name
+    ORDER BY totals DESC 
+    LIMIT 10
+"""
+cur = dict_cursor_execute(q, database=config.DATABASE)
 
 print("Top Donors (raw)")
-for row in cur_dict:
+for row in cur:
     row['totals'] = locale.currency(row['totals'], grouping=True)
     print('%(totals)20s: %(name)s' % row)
 
-# Close our database connection
-#     read_con.close()
-#     write_con.close()
-
 print('ran in', time.time() - start_time, 'seconds')
 
 
-# In[9]:
+# In[ ]:
 
 
 get_ipython().system('jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/')
diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py
index c8b5b3ea..45a5e254 100644
--- a/athena_example/athena_init.py
+++ b/athena_example/athena_init.py
@@ -51,7 +51,9 @@
 
 
 print('importing raw data from csv...')
-athenautils.athena_start_query("DROP TABLE IF EXISTS as_raw_table", database=config.DATABASE)
+athenautils.drop_external_table("as_raw_table", 
+                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),
+                                database=config.DATABASE)    
 athenautils.athena_start_query("DROP TABLE IF EXISTS as_donors", database=config.DATABASE)
 athenautils.athena_start_query("DROP TABLE IF EXISTS as_recipients", database=config.DATABASE)
 athenautils.athena_start_query("DROP TABLE IF EXISTS as_contributions", database=config.DATABASE)
@@ -87,39 +89,40 @@
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 
-df = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE,  
-                 error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#,
-
-# Remove the very few records that mess up the demo 
-# (demo purposes only! Don't do something like this in production)
-df = df[df['RcvDate'].str.len()>=10]
-
-# set empty, non-zero, strings in date columns to null
-df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan
-
-df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan
-
-#committee ID is requred. Remove the 2 rows that don't have it.
-df = df[df['ID']!='']
-
-# There's a record with a date stuck in the committee_id column, which causes
-# problems when inserting into the contributions table below. Get rid of it this 
-# way.
-df = df[df['ID'].str.len() <=9]
-
-# dropping the last columns
-df = df.drop(columns='Unnamed: 29')
-
-# Nullifying empty strings
-# df = df.replace(r'^\s*$', np.nan, regex=True)
-df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')
-
-athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None),
-           filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))
-
-# Athena is doesn't equate empty string and null, eventhough in the table spec we said so
-# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it
-# So we need to manually take care of that
+df_cursor = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE,  
+                        error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],
+                        chunksize=config.BUFFERSIZE)
+chunkcount = 0
+filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')
+for df in df_cursor: 
+    # Remove the very few records that mess up the demo 
+    # (demo purposes only! Don't do something like this in production)
+    df = df[df['RcvDate'].str.len()>=10]
+
+    # set empty, non-zero, strings in date columns to null
+    df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan
+
+    df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan
+
+    #committee ID is requred. Remove the 2 rows that don't have it.
+    df = df[df['ID']!='']
+
+    # There's a record with a date stuck in the committee_id column, which causes
+    # problems when inserting into the contributions table below. Get rid of it this 
+    # way.
+    df = df[df['ID'].str.len() <=9]
+
+    # dropping the last columns
+    df = df.drop(columns='Unnamed: 29')
+
+    df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')
+    
+    buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None)
+    
+    chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)
+    athenautils.write(body=buffer, filename=chunk_fname)
+    chunkcount += 1    
+    
 print('creating donors table...')
 q="""
 CREATE TABLE as_donors as
@@ -147,22 +150,6 @@
 
 print('creating contributions table')
 
-# --
-# c.execute("CREATE TABLE contributions "
-#           "(contribution_id INT, donor_id INT, recipient_id INT, "
-#           " report_type VARCHAR(24), date_recieved DATE, "
-#           " loan_amount VARCHAR(12), amount VARCHAR(23), "
-#           " receipt_type VARCHAR(23), "
-#           " vendor_last_name VARCHAR(70), "
-#           " vendor_first_name VARCHAR(20), "
-#           " vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
-#           " vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
-#           " vendor_zip VARCHAR(10), description VARCHAR(90), "
-#           " election_type VARCHAR(10), election_year VARCHAR(10), "
-#           " report_period_begin DATE, report_period_end DATE) "
-#           "CHARACTER SET utf8 COLLATE utf8_unicode_ci")
-# --
-
 q="""
 CREATE TABLE as_contributions as
     SELECT reciept_id as contribution_id, 
diff --git a/athena_example/athenautils.py b/athena_example/athenautils.py
index 3cd8e4dd..a88463e2 100644
--- a/athena_example/athenautils.py
+++ b/athena_example/athenautils.py
@@ -27,7 +27,7 @@
 athena = boto3.client('athena', region_name=config.REGION, 
                       aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
 
-def cursor_execute(query, database=None, cursortype='dict', buffersize=config.BUFFERSIZE, 
+def cursor_execute(query, database=None, cursortype='tuple', buffersize=1000000, 
                    output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, 
                    **kwargs):
     
@@ -104,10 +104,36 @@ def list_all(path):
     if is_s3_url(path):
         bucket, key = seperate_bucket_key(path)
         objects = s3.list_objects_v2(Bucket=bucket, Prefix=key)
+        if not 'Contents' in objects:
+            return []
         return [key['Key'] for key in objects['Contents']]
     from os import listdir
     from os.path import isfile, join
+    if not os.path.exists(path):
+        return []
     return listdir(path)
+
+def del_all_files(path):  # Delete every entry that list_all(path) reports: S3 objects for an S3 URL, otherwise files/directories under a local path.
+    filelist = list_all(path)  # S3: object keys; local: bare names from listdir (empty list when nothing exists)
+    if is_s3_url(path):
+        bucket, key = seperate_bucket_key(path)  # only `bucket` is used below; `key` is unused
+        for f in filelist:  # NOTE(review): list_all makes a single list_objects_v2 call, so presumably only the first page of keys is deleted - confirm
+            s3.delete_object(Bucket=bucket, Key=f)    
+        return
+    filelist = [os.path.join(path, f) for f in filelist]  # turn the bare names into paths rooted at `path`
+    for f in filelist:
+        if os.path.isfile(f):
+            os.remove(f)
+        else:    
+            shutil.rmtree(f)  # anything that is not a regular file is removed as a directory tree
+            
+def drop_external_table(tablename, location , database=None, 
+                        output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP):  # Drop the table `tablename` if it exists, then delete everything stored at `location`.
+    athena_start_query('drop table if exists {}'.format(tablename), database=database, 
+                       output_location=output_location, region=region, workgroup=workgroup)  # NOTE(review): unclear from here whether this waits for the query to finish - confirm
+    del_all_files(location)  # presumably needed because dropping an external table leaves its data files in place - confirm
+
+
     
 
 def pandas_read_csv(filepath_or_buffer, **kwargs):
diff --git a/athena_example/config.py b/athena_example/config.py
index f8e4a24b..9808a709 100644
--- a/athena_example/config.py
+++ b/athena_example/config.py
@@ -1,4 +1,5 @@
 LOG_FILE = 'log.txt'
+
 # Connection parameters
 ACCESS_KEY_ID = None
 SECRET_ACCESS_KEY = None
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
index e3a0e7b7..e452939c 100644
--- a/notebooks/athena_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -1,61 +1,8 @@
 {
  "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Note: \n",
-    "Looks good, but check the sanity check notebook to makesure everything is correct"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: dedupe in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (2.0.6)\n",
-      "Requirement already satisfied: categorical-distance>=1.9 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.9)\n",
-      "Requirement already satisfied: dedupe-variable-datetime in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1.5)\n",
-      "Requirement already satisfied: affinegap>=1.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.11)\n",
-      "Requirement already satisfied: highered>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.2.1)\n",
-      "Requirement already satisfied: typing-extensions in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (3.7.4.3)\n",
-      "Requirement already satisfied: simplecosine>=1.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.2)\n",
-      "Requirement already satisfied: doublemetaphone in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.1)\n",
-      "Requirement already satisfied: fastcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.1.26)\n",
-      "Requirement already satisfied: rlr>=2.4.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.4.5)\n",
-      "Requirement already satisfied: haversine>=0.4.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (2.3.0)\n",
-      "Requirement already satisfied: BTrees>=4.1.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (4.7.2)\n",
-      "Requirement already satisfied: numpy>=1.13 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.18.1)\n",
-      "Requirement already satisfied: zope.index in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (5.0.0)\n",
-      "Requirement already satisfied: dedupe-hcluster in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (0.3.8)\n",
-      "Requirement already satisfied: Levenshtein-search in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe) (1.4.5)\n",
-      "Requirement already satisfied: future in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.18.2)\n",
-      "Requirement already satisfied: datetime-distance in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from dedupe-variable-datetime->dedupe) (0.1.3)\n",
-      "Requirement already satisfied: pyhacrf-datamade>=0.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from highered>=0.2.0->dedupe) (0.2.5)\n",
-      "Requirement already satisfied: pylbfgs in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from rlr>=2.4.3->dedupe) (0.2.0.13)\n",
-      "Requirement already satisfied: zope.interface in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (5.1.2)\n",
-      "Requirement already satisfied: persistent>=4.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from BTrees>=4.1.4->dedupe) (4.6.4)\n",
-      "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (1.14.0)\n",
-      "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from zope.index->dedupe) (45.2.0.post20200210)\n",
-      "Requirement already satisfied: python-dateutil>=2.6.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from datetime-distance->dedupe-variable-datetime->dedupe) (2.8.1)\n",
-      "Requirement already satisfied: cffi; platform_python_implementation == \"CPython\" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from persistent>=4.1.0->BTrees>=4.1.4->dedupe) (1.14.0)\n",
-      "Requirement already satisfied: pycparser in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cffi; platform_python_implementation == \"CPython\"->persistent>=4.1.0->BTrees>=4.1.4->dedupe) (2.19)\n",
-      "\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.2.4 is available.\n",
-      "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "!pip install dedupe"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,10 +48,21 @@
     "sys.path.insert(0, '../athena_example/')\n",
     "import athenautils\n",
     "\n",
-    "def as_pandas(query, **kwrgs):\n",
-    "    df = athenautils.athena_to_panda(query, escapechar=None, keep_default_na=False, na_values=[''], **kwrgs)\n",
-    "    return df.where(pd.notnull(df), None)\n",
-    "\n",
+    "def cursor_execute(query, database):\n",
+    "    '''\n",
+    "    The MySQL compatible Cursor\n",
+    "    '''\n",
+    "    return athenautils.cursor_execute(query, database=database, \n",
+    "                                      cursortype='tuple', buffersize=config.BUFFERSIZE,\n",
+    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
+    "\n",
+    "def dict_cursor_execute(query, database):\n",
+    "    '''\n",
+    "    The MySQL compatible DictCursor\n",
+    "    '''\n",
+    "    return athenautils.cursor_execute(query, database=database, \n",
+    "                                      cursortype='dict', buffersize=config.BUFFERSIZE,\n",
+    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
     "def record_pairs(result_set):\n",
     "    for i, row in enumerate(result_set):\n",
     "        a_record_id, a_record, b_record_id, b_record = row\n",
@@ -125,26 +83,25 @@
     "            yield donor_id, cluster_id, score\n",
     "\n",
     "\n",
-    "# if __name__ == '__main__':\n",
-    "if True:\n",
+    "if __name__ == '__main__':\n",
     "\n",
-    "    # ## Logging\n",
+    "    ## Logging\n",
     "\n",
     "    # Dedupe uses Python logging to show or suppress verbose output. Added\n",
     "    # for convenience.  To enable verbose output, run `python\n",
     "    # examples/mysql_example/mysql_example.py -v`\n",
     "    \n",
-    "#     optp = optparse.OptionParser()\n",
-    "#     optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
-    "#                     help='Increase verbosity (specify multiple times for more)'\n",
-    "#                     )\n",
-    "#     (opts, args) = optp.parse_args()\n",
+    "    optp = optparse.OptionParser()\n",
+    "    optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
+    "                    help='Increase verbosity (specify multiple times for more)'\n",
+    "                    )\n",
+    "    (opts, args) = optp.parse_args()\n",
     "    log_level = logging.WARNING\n",
-    "#     if opts.verbose:\n",
-    "#         if opts.verbose == 1:\n",
-    "#             log_level = logging.INFO\n",
-    "#         elif opts.verbose >= 2:\n",
-    "#             log_level = logging.DEBUG\n",
+    "    if opts.verbose:\n",
+    "        if opts.verbose == 1:\n",
+    "            log_level = logging.INFO\n",
+    "        elif opts.verbose >= 2:\n",
+    "            log_level = logging.DEBUG\n",
     "\n",
     "\n",
     "    logging.getLogger().setLevel(log_level)\n",
@@ -160,17 +117,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "reading from  mysql_example_settings\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # We'll be using variations on this following select statement to pull\n",
     "    # in campaign donor info.\n",
@@ -253,28 +202,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "blocking...\n",
-      "creating as_blocking_map database\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'5651b314-d20b-4404-aa8d-30df70804e0e'"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # ## Blocking\n",
     "\n",
@@ -283,7 +213,9 @@
     "    # To run blocking on such a large set of data, we create a separate table\n",
     "    # that contains blocking keys and record ids\n",
     "    print('creating as_blocking_map database')\n",
-    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_blocking_map\", database=config.DATABASE)\n",
+    "    athenautils.drop_external_table(\"as_blocking_map\", \n",
+    "                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n",
+    "                                    database=config.DATABASE)\n",
     "\n",
     "    q=\"\"\"\n",
     "    CREATE EXTERNAL TABLE as_blocking_map     \n",
@@ -303,17 +235,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "creating inverted index\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # If dedupe learned a Index Predicate, we have to take a pass\n",
     "    # through the data and create indices.\n",
@@ -334,137 +258,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "writing blocking map\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # Now we are ready to write our blocking map table by creating a\n",
     "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
     "    print('writing blocking map')\n",
     "    \n",
-    "    read_cur  = athenautils.cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
+    "    read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
     "    full_data = ((row['donor_id'], row) for row in read_cur)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_0.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_1.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_2.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_3.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_4.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_5.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_6.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_7.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_8.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_9.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_10.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_11.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_12.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_13.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_14.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_15.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_16.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_17.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_18.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_19.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_20.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_21.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_22.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_23.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_24.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_25.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_26.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_27.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_28.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_29.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_30.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_31.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_32.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_33.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_34.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_35.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_36.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_37.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_38.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_39.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_40.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_41.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_42.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_43.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_44.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_45.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_46.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_47.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_48.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_49.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_50.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_51.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_52.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_53.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_54.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_55.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_56.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_57.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_58.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_59.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_60.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_61.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_62.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_63.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_64.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_65.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_66.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_67.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_68.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_69.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_70.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_71.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_72.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_73.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_74.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_75.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_76.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_77.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_78.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_79.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_80.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_81.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_82.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_83.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_84.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_85.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_86.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_87.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_88.csv\n",
-      "s3://ria-temp/as_dedupe/as_blocking_map/blocking_89.csv\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    b_data = deduper.fingerprinter(full_data)\n",
     "    athenautils.write_many(b_data, \n",
-    "                           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY, 'as_blocking_map/blocking.csv'))"
+    "                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -487,535 +306,14 @@
     "        INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
     "        INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
     "       \"\"\"\n",
-    "    read_cur = athenautils.cursor_execute(q, cursortype='tuple', database=config.DATABASE)\n"
+    "    read_cur = cursor_execute(q, database=config.DATABASE)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "clustering...\n",
-      "0\n",
-      "10000\n",
-      "20000\n",
-      "30000\n",
-      "40000\n",
-      "50000\n",
-      "60000\n",
-      "70000\n",
-      "80000\n",
-      "90000\n",
-      "100000\n",
-      "110000\n",
-      "120000\n",
-      "130000\n",
-      "140000\n",
-      "150000\n",
-      "160000\n",
-      "170000\n",
-      "180000\n",
-      "190000\n",
-      "200000\n",
-      "210000\n",
-      "220000\n",
-      "230000\n",
-      "240000\n",
-      "250000\n",
-      "260000\n",
-      "270000\n",
-      "280000\n",
-      "290000\n",
-      "300000\n",
-      "310000\n",
-      "320000\n",
-      "330000\n",
-      "340000\n",
-      "350000\n",
-      "360000\n",
-      "370000\n",
-      "380000\n",
-      "390000\n",
-      "400000\n",
-      "410000\n",
-      "420000\n",
-      "430000\n",
-      "440000\n",
-      "450000\n",
-      "460000\n",
-      "470000\n",
-      "480000\n",
-      "490000\n",
-      "500000\n",
-      "510000\n",
-      "520000\n",
-      "530000\n",
-      "540000\n",
-      "550000\n",
-      "560000\n",
-      "570000\n",
-      "580000\n",
-      "590000\n",
-      "600000\n",
-      "610000\n",
-      "620000\n",
-      "630000\n",
-      "640000\n",
-      "650000\n",
-      "660000\n",
-      "670000\n",
-      "680000\n",
-      "690000\n",
-      "700000\n",
-      "710000\n",
-      "720000\n",
-      "730000\n",
-      "740000\n",
-      "750000\n",
-      "760000\n",
-      "770000\n",
-      "780000\n",
-      "790000\n",
-      "800000\n",
-      "810000\n",
-      "820000\n",
-      "830000\n",
-      "840000\n",
-      "850000\n",
-      "860000\n",
-      "870000\n",
-      "880000\n",
-      "890000\n",
-      "900000\n",
-      "910000\n",
-      "920000\n",
-      "930000\n",
-      "940000\n",
-      "950000\n",
-      "960000\n",
-      "970000\n",
-      "980000\n",
-      "990000\n",
-      "1000000\n",
-      "1010000\n",
-      "1020000\n",
-      "1030000\n",
-      "1040000\n",
-      "1050000\n",
-      "1060000\n",
-      "1070000\n",
-      "1080000\n",
-      "1090000\n",
-      "1100000\n",
-      "1110000\n",
-      "1120000\n",
-      "1130000\n",
-      "1140000\n",
-      "1150000\n",
-      "1160000\n",
-      "1170000\n",
-      "1180000\n",
-      "1190000\n",
-      "1200000\n",
-      "1210000\n",
-      "1220000\n",
-      "1230000\n",
-      "1240000\n",
-      "1250000\n",
-      "1260000\n",
-      "1270000\n",
-      "1280000\n",
-      "1290000\n",
-      "1300000\n",
-      "1310000\n",
-      "1320000\n",
-      "1330000\n",
-      "1340000\n",
-      "1350000\n",
-      "1360000\n",
-      "1370000\n",
-      "1380000\n",
-      "1390000\n",
-      "1400000\n",
-      "1410000\n",
-      "1420000\n",
-      "1430000\n",
-      "1440000\n",
-      "1450000\n",
-      "1460000\n",
-      "1470000\n",
-      "1480000\n",
-      "1490000\n",
-      "1500000\n",
-      "1510000\n",
-      "1520000\n",
-      "1530000\n",
-      "1540000\n",
-      "1550000\n",
-      "1560000\n",
-      "1570000\n",
-      "1580000\n",
-      "1590000\n",
-      "1600000\n",
-      "1610000\n",
-      "1620000\n",
-      "1630000\n",
-      "1640000\n",
-      "1650000\n",
-      "1660000\n",
-      "1670000\n",
-      "1680000\n",
-      "1690000\n",
-      "1700000\n",
-      "1710000\n",
-      "1720000\n",
-      "1730000\n",
-      "1740000\n",
-      "1750000\n",
-      "1760000\n",
-      "1770000\n",
-      "1780000\n",
-      "1790000\n",
-      "1800000\n",
-      "1810000\n",
-      "1820000\n",
-      "1830000\n",
-      "1840000\n",
-      "1850000\n",
-      "1860000\n",
-      "1870000\n",
-      "1880000\n",
-      "1890000\n",
-      "1900000\n",
-      "1910000\n",
-      "1920000\n",
-      "1930000\n",
-      "1940000\n",
-      "1950000\n",
-      "1960000\n",
-      "1970000\n",
-      "1980000\n",
-      "1990000\n",
-      "2000000\n",
-      "2010000\n",
-      "2020000\n",
-      "2030000\n",
-      "2040000\n",
-      "2050000\n",
-      "2060000\n",
-      "2070000\n",
-      "2080000\n",
-      "2090000\n",
-      "2100000\n",
-      "2110000\n",
-      "2120000\n",
-      "2130000\n",
-      "2140000\n",
-      "2150000\n",
-      "2160000\n",
-      "2170000\n",
-      "2180000\n",
-      "2190000\n",
-      "2200000\n",
-      "2210000\n",
-      "2220000\n",
-      "2230000\n",
-      "2240000\n",
-      "2250000\n",
-      "2260000\n",
-      "2270000\n",
-      "2280000\n",
-      "2290000\n",
-      "2300000\n",
-      "2310000\n",
-      "2320000\n",
-      "2330000\n",
-      "2340000\n",
-      "2350000\n",
-      "2360000\n",
-      "2370000\n",
-      "2380000\n",
-      "2390000\n",
-      "2400000\n",
-      "2410000\n",
-      "2420000\n",
-      "2430000\n",
-      "2440000\n",
-      "2450000\n",
-      "2460000\n",
-      "2470000\n",
-      "2480000\n",
-      "2490000\n",
-      "2500000\n",
-      "2510000\n",
-      "2520000\n",
-      "2530000\n",
-      "2540000\n",
-      "2550000\n",
-      "2560000\n",
-      "2570000\n",
-      "2580000\n",
-      "2590000\n",
-      "2600000\n",
-      "2610000\n",
-      "2620000\n",
-      "2630000\n",
-      "2640000\n",
-      "2650000\n",
-      "2660000\n",
-      "2670000\n",
-      "2680000\n",
-      "2690000\n",
-      "2700000\n",
-      "2710000\n",
-      "2720000\n",
-      "2730000\n",
-      "2740000\n",
-      "2750000\n",
-      "2760000\n",
-      "2770000\n",
-      "2780000\n",
-      "2790000\n",
-      "2800000\n",
-      "2810000\n",
-      "2820000\n",
-      "2830000\n",
-      "2840000\n",
-      "2850000\n",
-      "2860000\n",
-      "2870000\n",
-      "2880000\n",
-      "2890000\n",
-      "2900000\n",
-      "2910000\n",
-      "2920000\n",
-      "2930000\n",
-      "2940000\n",
-      "2950000\n",
-      "2960000\n",
-      "2970000\n",
-      "2980000\n",
-      "2990000\n",
-      "3000000\n",
-      "3010000\n",
-      "3020000\n",
-      "3030000\n",
-      "3040000\n",
-      "3050000\n",
-      "3060000\n",
-      "3070000\n",
-      "3080000\n",
-      "3090000\n",
-      "3100000\n",
-      "3110000\n",
-      "3120000\n",
-      "3130000\n",
-      "3140000\n",
-      "3150000\n",
-      "3160000\n",
-      "3170000\n",
-      "3180000\n",
-      "3190000\n",
-      "3200000\n",
-      "3210000\n",
-      "3220000\n",
-      "3230000\n",
-      "3240000\n",
-      "3250000\n",
-      "3260000\n",
-      "3270000\n",
-      "3280000\n",
-      "3290000\n",
-      "3300000\n",
-      "3310000\n",
-      "3320000\n",
-      "3330000\n",
-      "3340000\n",
-      "3350000\n",
-      "3360000\n",
-      "3370000\n",
-      "3380000\n",
-      "3390000\n",
-      "3400000\n",
-      "3410000\n",
-      "3420000\n",
-      "3430000\n",
-      "3440000\n",
-      "3450000\n",
-      "3460000\n",
-      "3470000\n",
-      "3480000\n",
-      "3490000\n",
-      "3500000\n",
-      "3510000\n",
-      "3520000\n",
-      "3530000\n",
-      "3540000\n",
-      "3550000\n",
-      "3560000\n",
-      "3570000\n",
-      "3580000\n",
-      "3590000\n",
-      "3600000\n",
-      "3610000\n",
-      "3620000\n",
-      "3630000\n",
-      "3640000\n",
-      "3650000\n",
-      "3660000\n",
-      "3670000\n",
-      "3680000\n",
-      "3690000\n",
-      "3700000\n",
-      "3710000\n",
-      "3720000\n",
-      "3730000\n",
-      "3740000\n",
-      "3750000\n",
-      "3760000\n",
-      "3770000\n",
-      "3780000\n",
-      "3790000\n",
-      "3800000\n",
-      "3810000\n",
-      "3820000\n",
-      "3830000\n",
-      "3840000\n",
-      "3850000\n",
-      "3860000\n",
-      "3870000\n",
-      "3880000\n",
-      "3890000\n",
-      "3900000\n",
-      "3910000\n",
-      "3920000\n",
-      "3930000\n",
-      "3940000\n",
-      "3950000\n",
-      "3960000\n",
-      "3970000\n",
-      "3980000\n",
-      "3990000\n",
-      "4000000\n",
-      "4010000\n",
-      "4020000\n",
-      "4030000\n",
-      "4040000\n",
-      "4050000\n",
-      "4060000\n",
-      "4070000\n",
-      "4080000\n",
-      "4090000\n",
-      "4100000\n",
-      "4110000\n",
-      "4120000\n",
-      "4130000\n",
-      "4140000\n",
-      "4150000\n",
-      "4160000\n",
-      "4170000\n",
-      "4180000\n",
-      "4190000\n",
-      "4200000\n",
-      "4210000\n",
-      "4220000\n",
-      "4230000\n",
-      "4240000\n",
-      "4250000\n",
-      "4260000\n",
-      "4270000\n",
-      "4280000\n",
-      "4290000\n",
-      "4300000\n",
-      "4310000\n",
-      "4320000\n",
-      "4330000\n",
-      "4340000\n",
-      "4350000\n",
-      "4360000\n",
-      "4370000\n",
-      "4380000\n",
-      "4390000\n",
-      "4400000\n",
-      "4410000\n",
-      "4420000\n",
-      "4430000\n",
-      "4440000\n",
-      "4450000\n",
-      "4460000\n",
-      "4470000\n",
-      "4480000\n",
-      "4490000\n",
-      "4500000\n",
-      "4510000\n",
-      "4520000\n",
-      "4530000\n",
-      "4540000\n",
-      "4550000\n",
-      "4560000\n",
-      "4570000\n",
-      "4580000\n",
-      "4590000\n",
-      "4600000\n",
-      "4610000\n",
-      "4620000\n",
-      "4630000\n",
-      "4640000\n",
-      "4650000\n",
-      "4660000\n",
-      "4670000\n",
-      "4680000\n",
-      "4690000\n",
-      "4700000\n",
-      "4710000\n",
-      "4720000\n",
-      "4730000\n",
-      "4740000\n",
-      "4750000\n",
-      "4760000\n",
-      "4770000\n",
-      "4780000\n",
-      "4790000\n",
-      "4800000\n",
-      "4810000\n",
-      "4820000\n",
-      "4830000\n",
-      "4840000\n",
-      "4850000\n",
-      "4860000\n",
-      "4870000\n",
-      "4880000\n",
-      "4890000\n",
-      "4900000\n",
-      "4910000\n",
-      "4920000\n",
-      "4930000\n",
-      "4940000\n",
-      "4950000\n",
-      "4960000\n",
-      "4970000\n",
-      "4980000\n",
-      "4990000\n",
-      "5000000\n",
-      "5010000\n",
-      "5020000\n",
-      "5030000\n",
-      "5040000\n",
-      "5050000\n",
-      "5060000\n",
-      "5070000\n",
-      "5080000\n",
-      "5090000\n",
-      "5100000\n",
-      "5110000\n",
-      "5120000\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # ## Clustering\n",
     "\n",
@@ -1026,79 +324,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "creating as_entity_map database\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.048800940974382e-18\n",
-      "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.1038451096258602e-17\n",
-      "WARNING:dedupe.clustering:A component contained 158378 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.921472802362437e-17\n",
-      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.3509749469939632e-16\n",
-      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.753198598980096e-16\n",
-      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0403512617910142e-15\n",
-      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.894390795098561e-15\n",
-      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 8.13221083415663e-15\n",
-      "WARNING:dedupe.clustering:A component contained 158376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.2319167959587083e-14\n",
-      "WARNING:dedupe.clustering:A component contained 158375 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 6.068761194789993e-14\n",
-      "WARNING:dedupe.clustering:A component contained 158372 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.6520029686587212e-13\n",
-      "WARNING:dedupe.clustering:A component contained 158364 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.5125040046676256e-13\n",
-      "WARNING:dedupe.clustering:A component contained 158352 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.2269463311618112e-12\n",
-      "WARNING:dedupe.clustering:A component contained 158314 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.3356439660236965e-12\n",
-      "WARNING:dedupe.clustering:A component contained 157999 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.069763004960628e-12\n",
-      "WARNING:dedupe.clustering:A component contained 157528 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.4668471436592435e-11\n",
-      "WARNING:dedupe.clustering:A component contained 157002 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 6.705726454279488e-11\n",
-      "WARNING:dedupe.clustering:A component contained 156034 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.822899310600237e-10\n",
-      "WARNING:dedupe.clustering:A component contained 153167 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.955617200886392e-10\n",
-      "WARNING:dedupe.clustering:A component contained 150749 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.3472511446191333e-09\n",
-      "WARNING:dedupe.clustering:A component contained 148126 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 3.663451859737113e-09\n",
-      "WARNING:dedupe.clustering:A component contained 144445 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 9.961619074337575e-09\n",
-      "WARNING:dedupe.clustering:A component contained 140752 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.7079365851019727e-08\n",
-      "WARNING:dedupe.clustering:A component contained 136821 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 7.361173277021834e-08\n",
-      "WARNING:dedupe.clustering:A component contained 132985 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.00129481181664e-07\n",
-      "WARNING:dedupe.clustering:A component contained 129188 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 5.440113266301783e-07\n",
-      "WARNING:dedupe.clustering:A component contained 126461 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.4789049802767608e-06\n",
-      "WARNING:dedupe.clustering:A component contained 124279 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 4.020875427015852e-06\n",
-      "WARNING:dedupe.clustering:A component contained 121039 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 1.0930919387732102e-05\n",
-      "WARNING:dedupe.clustering:A component contained 117376 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 2.971327846301476e-05\n",
-      "WARNING:dedupe.clustering:A component contained 114455 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 8.076722851404745e-05\n",
-      "WARNING:dedupe.clustering:A component contained 109969 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.0002195200394847895\n",
-      "WARNING:dedupe.clustering:A component contained 106867 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.0005965000236037636\n",
-      "WARNING:dedupe.clustering:A component contained 101488 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.001619959237855584\n",
-      "WARNING:dedupe.clustering:A component contained 94945 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.004391296209574281\n",
-      "WARNING:dedupe.clustering:A component contained 89944 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.01184744490841891\n",
-      "WARNING:dedupe.clustering:A component contained 85759 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.031562819826922474\n",
-      "WARNING:dedupe.clustering:A component contained 79119 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.08138832068049236\n",
-      "WARNING:dedupe.clustering:A component contained 73185 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.1940994212134007\n",
-      "WARNING:dedupe.clustering:A component contained 67046 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.39565940016357204\n",
-      "WARNING:dedupe.clustering:A component contained 57601 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.6402437741869547\n",
-      "WARNING:dedupe.clustering:A component contained 36731 elements. Components larger than 30000 are re-filtered. The threshold for this filtering is 0.8286982262391892\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_0.csv\n",
-      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_1.csv\n",
-      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_2.csv\n",
-      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_3.csv\n",
-      "s3://ria-temp/as_dedupe/as_entity_map/as_entity_map_4.csv\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n",
-    "\n",
+    "#     athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n",
+    "    athenautils.drop_external_table(\"as_entity_map\", \n",
+    "                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n",
+    "                                    database=config.DATABASE)\n",
+    "    \n",
     "    print('creating as_entity_map database')\n",
     "    q=\"\"\"\n",
     "    CREATE EXTERNAL TABLE as_entity_map     \n",
@@ -1117,45 +351,14 @@
     "    athenautils.athena_start_query(q, database=config.DATABASE) \n",
     "\n",
     "    athenautils.write_many(cluster_ids(clustered_dupes),\n",
-    "                          filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY, 'as_entity_map/as_entity_map.csv'))"
+    "                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "# duplicate sets\n",
-      "Top Donors (deduped)\n",
-      "      $32,146,134.06: democratic party of illinois\n",
-      "      $13,762,181.54: republican state senate campaign committee\n",
-      "       $9,590,682.54: republican governors association\n",
-      "       $9,040,913.46: madigan michael friends of\n",
-      "       $7,949,218.49: seiu healthcare il in pac\n",
-      "       $6,435,815.20: chicago teachers union, ift local 1\n",
-      "       $6,353,463.90: illinois senate democratic fund (the)\n",
-      "       $6,077,259.02: fred eychaner\n",
-      "       $6,022,884.47: scott cohen\n",
-      "       $5,911,667.89: illinois republican party\n",
-      "Top Donors (raw)\n",
-      "      $14,319,194.47: democratic party of illinois\n",
-      "      $13,020,132.76: democratic party of illinois\n",
-      "       $9,027,432.54: republican governors association\n",
-      "       $7,897,829.31: rga illinois 2010 pac\n",
-      "       $6,675,000.00: madigan michael friends of\n",
-      "       $6,008,841.69: scott cohen\n",
-      "       $5,570,839.00: ronald gidwitz,\n",
-      "       $5,562,800.00: citizens for emil jones\n",
-      "       $5,324,649.63: paul wood,\n",
-      "       $5,132,563.83: seiu healthcare il in\n",
-      "ran in 3723.114373922348 seconds\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "    # Print out the number of duplicates found\n",
     "    print('# duplicate sets')\n",
@@ -1193,7 +396,7 @@
     "        ON as_donors.donor_id = donation_totals.canon_id\n",
     "        ORDER BY totals DESC\n",
     "    \"\"\"\n",
-    "    cur = athenautils.cursor_execute(q, database=config.DATABASE)\n",
+    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
     "\n",
     "    print(\"Top Donors (deduped)\")\n",
     "    for row in cur:\n",
@@ -1217,27 +420,32 @@
     "        ORDER BY totals DESC \n",
     "        LIMIT 10\n",
     "    \"\"\"\n",
-    "    cur = athenautils.cursor_execute(q, database=config.DATABASE)\n",
+    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
     "\n",
     "    print(\"Top Donors (raw)\")\n",
     "    for row in cur:\n",
     "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
     "        print('%(totals)20s: %(name)s' % row)\n",
     "\n",
-    "    # Close our database connection\n",
-    "#     read_con.close()\n",
-    "#     write_con.close()\n",
-    "\n",
     "    print('ran in', time.time() - start_time, 'seconds')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[NbConvertApp] Converting notebook athena_example.ipynb to script\n",
+      "[NbConvertApp] Writing 12622 bytes to ../athena_example/athena_example.py\n"
+     ]
+    }
+   ],
    "source": [
-    "# !jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/"
+    "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/\n"
    ]
   },
   {
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
index a059f520..d35250de 100644
--- a/notebooks/athena_init_db.ipynb
+++ b/notebooks/athena_init_db.ipynb
@@ -2,20 +2,22 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install dedupe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ../athena_example/config.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile ../athena_example/config.py\n",
     "LOG_FILE = 'log.txt'\n",
+    "\n",
     "# Connection parameters\n",
     "ACCESS_KEY_ID = None\n",
     "SECRET_ACCESS_KEY = None\n",
@@ -32,17 +34,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ../athena_example/athena_init.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile ../athena_example/athena_init.py\n",
     "#!/usr/bin/python\n",
@@ -98,7 +92,9 @@
     "\n",
     "\n",
     "print('importing raw data from csv...')\n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_raw_table\", database=config.DATABASE)\n",
+    "athenautils.drop_external_table(\"as_raw_table\", \n",
+    "                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n",
+    "                                database=config.DATABASE)    \n",
     "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n",
     "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n",
     "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n",
@@ -134,39 +130,40 @@
     "athenautils.athena_start_query(q, database=config.DATABASE)\n",
     "\n",
     "\n",
-    "df = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE,  \n",
-    "                 error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''])#,\n",
-    "\n",
-    "# Remove the very few records that mess up the demo \n",
-    "# (demo purposes only! Don't do something like this in production)\n",
-    "df = df[df['RcvDate'].str.len()>=10]\n",
-    "\n",
-    "# set empty, non-zero, strings in date columns to null\n",
-    "df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n",
-    "\n",
-    "df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
-    "\n",
-    "#committee ID is requred. Remove the 2 rows that don't have it.\n",
-    "df = df[df['ID']!='']\n",
-    "\n",
-    "# There's a record with a date stuck in the committee_id column, which causes\n",
-    "# problems when inserting into the contributions table below. Get rid of it this \n",
-    "# way.\n",
-    "df = df[df['ID'].str.len() <=9]\n",
-    "\n",
-    "# dropping the last columns\n",
-    "df = df.drop(columns='Unnamed: 29')\n",
-    "\n",
-    "# Nullifying empty strings\n",
-    "# df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
-    "df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
-    "\n",
-    "athenautils.write(body=df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None),\n",
-    "           filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', contributions_txt_file,))\n",
-    "\n",
-    "# Athena is doesn't equate empty string and null, eventhough in the table spec we said so\n",
-    "# Not that it's a bug, it works if the string is null in the source, but not after applying trim to it\n",
-    "# So we need to manually take care of that\n",
+    "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE,  \n",
+    "                        error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],\n",
+    "                        chunksize=config.BUFFERSIZE)\n",
+    "chunkcount = 0\n",
+    "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n",
+    "for df in df_cursor: \n",
+    "    # Remove the very few records that mess up the demo \n",
+    "    # (demo purposes only! Don't do something like this in production)\n",
+    "    df = df[df['RcvDate'].str.len()>=10]\n",
+    "\n",
+    "    # set empty, non-zero, strings in date columns to null\n",
+    "    df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n",
+    "\n",
+    "    df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
+    "\n",
+    "    #committee ID is requred. Remove the 2 rows that don't have it.\n",
+    "    df = df[df['ID']!='']\n",
+    "\n",
+    "    # There's a record with a date stuck in the committee_id column, which causes\n",
+    "    # problems when inserting into the contributions table below. Get rid of it this \n",
+    "    # way.\n",
+    "    df = df[df['ID'].str.len() <=9]\n",
+    "\n",
+    "    # dropping the last columns\n",
+    "    df = df.drop(columns='Unnamed: 29')\n",
+    "\n",
+    "    df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
+    "    \n",
+    "    buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n",
+    "    \n",
+    "    chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n",
+    "    athenautils.write(body=buffer, filename=chunk_fname)\n",
+    "    chunkcount += 1    \n",
+    "    \n",
     "print('creating donors table...')\n",
     "q=\"\"\"\n",
     "CREATE TABLE as_donors as\n",
@@ -194,22 +191,6 @@
     "\n",
     "print('creating contributions table')\n",
     "\n",
-    "# --\n",
-    "# c.execute(\"CREATE TABLE contributions \"\n",
-    "#           \"(contribution_id INT, donor_id INT, recipient_id INT, \"\n",
-    "#           \" report_type VARCHAR(24), date_recieved DATE, \"\n",
-    "#           \" loan_amount VARCHAR(12), amount VARCHAR(23), \"\n",
-    "#           \" receipt_type VARCHAR(23), \"\n",
-    "#           \" vendor_last_name VARCHAR(70), \"\n",
-    "#           \" vendor_first_name VARCHAR(20), \"\n",
-    "#           \" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \"\n",
-    "#           \" vendor_city VARCHAR(20), vendor_state VARCHAR(10), \"\n",
-    "#           \" vendor_zip VARCHAR(10), description VARCHAR(90), \"\n",
-    "#           \" election_type VARCHAR(10), election_year VARCHAR(10), \"\n",
-    "#           \" report_period_begin DATE, report_period_end DATE) \"\n",
-    "#           \"CHARACTER SET utf8 COLLATE utf8_unicode_ci\")\n",
-    "# --\n",
-    "\n",
     "q=\"\"\"\n",
     "CREATE TABLE as_contributions as\n",
     "    SELECT reciept_id as contribution_id, \n",
@@ -264,35 +245,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "importing raw data from csv...\n",
-      "b'Skipping line 1441352: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1465996: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1495732: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1631504: expected 30 fields, saw 31\\nSkipping line 1631506: expected 30 fields, saw 31\\n'\n",
-      "b'Skipping line 1660260: expected 30 fields, saw 31\\nSkipping line 1660264: expected 30 fields, saw 32\\n'\n",
-      "creating donors table...\n",
-      "creating contributions table\n",
-      "done\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!python ../athena_example/athena_init.py"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

From 5636f41b6ac9531ca2d6118f4fbfe520aae57711 Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-10-91.eu-west-1.compute.internal>
Date: Wed, 4 Nov 2020 06:37:31 +0000
Subject: [PATCH 11/19] checkpoint

---
 athena_example/athena_example.py | 332 ++-----------------------------
 notebooks/athena_example.ipynb   | 145 +++++---------
 2 files changed, 68 insertions(+), 409 deletions(-)

diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py
index 384172f2..7cedbfe6 100644
--- a/athena_example/athena_example.py
+++ b/athena_example/athena_example.py
@@ -1,8 +1,3 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# In[ ]:
-
 
 """
 This is an example of working with very large data. There are about
@@ -80,8 +75,8 @@ def cluster_ids(clustered_dupes):
         for donor_id, score in zip(cluster, scores):
             yield donor_id, cluster_id, score
 
-
-if __name__ == '__main__':
+if True:
+# if __name__ == '__main__':
 
     ## Logging
 
@@ -89,20 +84,20 @@ def cluster_ids(clustered_dupes):
     # for convenience.  To enable verbose output, run `python
     # examples/mysql_example/mysql_example.py -v`
     
-    optp = optparse.OptionParser()
-    optp.add_option('-v', '--verbose', dest='verbose', action='count',
-                    help='Increase verbosity (specify multiple times for more)'
-                    )
-    (opts, args) = optp.parse_args()
+#     optp = optparse.OptionParser()
+#     optp.add_option('-v', '--verbose', dest='verbose', action='count',
+#                     help='Increase verbosity (specify multiple times for more)'
+#                     )
+#     (opts, args) = optp.parse_args()
     log_level = logging.WARNING
-    if opts.verbose:
-        if opts.verbose == 1:
-            log_level = logging.INFO
-        elif opts.verbose >= 2:
-            log_level = logging.DEBUG
+#     if opts.verbose:
+#         if opts.verbose == 1:
+#             log_level = logging.INFO
+#         elif opts.verbose >= 2:
+#             log_level = logging.DEBUG
 
 
-    logging.getLogger().setLevel(log_level)
+#     logging.getLogger().setLevel(log_level)
 
     
 
@@ -111,304 +106,3 @@ def cluster_ids(clustered_dupes):
     training_file = 'mysql_example_training.json'
 
     start_time = time.time()
-
-
-# In[ ]:
-
-
-# We'll be using variations on this following select statement to pull
-# in campaign donor info.
-#
-# We did a fair amount of preprocessing of the fields in
-# `mysql_init_db.py`    
-DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address
-                  from as_processed_donors"""
-
-# ## Training
-
-if os.path.exists(settings_file):
-    print('reading from ', settings_file)
-    with open(settings_file, 'rb') as sf:
-        deduper = dedupe.StaticDedupe(sf, num_cores=4)
-else:
-    # Define the fields dedupe will pay attention to
-    #
-    # The address, city, and zip fields are often missing, so we'll
-    # tell dedupe that, and we'll learn a model that take that into
-    # account
-    fields = [{'field': 'name', 'type': 'String'},
-              {'field': 'address', 'type': 'String',
-               'has missing': True},
-              {'field': 'city', 'type': 'ShortString', 'has missing': True},
-              {'field': 'state', 'type': 'ShortString', 'has missing': True},
-              {'field': 'zip', 'type': 'ShortString', 'has missing': True},
-              ]
-
-    # Create a new deduper object and pass our data model to it.
-    deduper = dedupe.Dedupe(fields, num_cores=4)
-
-    # We will sample pairs from the entire donor table for training
-    cur = cur_execute(DONOR_SELECT)
-    temp_d = {i: row for i, row in enumerate(cur)}
-        
-
-    # If we have training data saved from a previous run of dedupe,
-    # look for it an load it in.
-    #
-    # __Note:__ if you want to train from
-    # scratch, delete the training_file
-    if os.path.exists(training_file):
-        print('reading labeled examples from ', training_file)
-        with open(training_file) as tf:
-            deduper.prepare_training(temp_d, training_file=tf)
-    else:
-        deduper.prepare_training(temp_d)
-
-    del temp_d
-
-    # ## Active learning
-
-    print('starting active labeling...')
-    # Starts the training loop. Dedupe will find the next pair of records
-    # it is least certain about and ask you to label them as duplicates
-    # or not.
-
-    # use 'y', 'n' and 'u' keys to flag duplicates
-    # press 'f' when you are finished
-    dedupe.convenience.console_label(deduper)
-    # When finished, save our labeled, training pairs to disk
-    with open(training_file, 'w') as tf:
-        deduper.write_training(tf)
-
-    # Notice our the argument here
-    #
-    # `recall` is the proportion of true dupes pairs that the learned
-    # rules must cover. You may want to reduce this if your are making
-    # too many blocks and too many comparisons.
-    deduper.train(recall=0.90)
-
-    with open(settings_file, 'wb') as sf:
-        deduper.write_settings(sf)
-
-    # We can now remove some of the memory hobbing objects we used
-    # for training
-    deduper.cleanup_training()
-
-
-# In[ ]:
-
-
-# ## Blocking
-
-print('blocking...')
-
-# To run blocking on such a large set of data, we create a separate table
-# that contains blocking keys and record ids
-print('creating as_blocking_map database')
-athenautils.drop_external_table("as_blocking_map", 
-                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),
-                                database=config.DATABASE)
-
-q="""
-CREATE EXTERNAL TABLE as_blocking_map     
-    (block_key VARCHAR(200), donor_id INTEGER)
-ROW FORMAT DELIMITED
-  FIELDS TERMINATED BY '\t'
-  LINES TERMINATED BY '\n'  
-LOCATION
-    's3://{}/{}' 
-TBLPROPERTIES (
-    'classification'='csv', 
-    --'skip.header.line.count'='1',  
-    'serialization.null.format'='')
-""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') 
-athenautils.athena_start_query(q, database=config.DATABASE)
-
-
-# In[ ]:
-
-
-# If dedupe learned a Index Predicate, we have to take a pass
-# through the data and create indices.
-print('creating inverted index')
-
-# Armin: 
-# This never runs, index_fields is empty, possible bug?
-for field in deduper.fingerprinter.index_fields:
-    q = """
-    SELECT DISTINCT {field} FROM as_processed_donors
-    WHERE {field} IS NOT NULL
-    """.format(field=field)
-    cur = cur_execute(q)
-    field_data = (row[field] for row in cur)
-    deduper.fingerprinter.index(field_data, field)
- 
-
-
-# In[ ]:
-
-
-# Now we are ready to write our blocking map table by creating a
-# generator that yields unique `(block_key, donor_id)` tuples.
-print('writing blocking map')
-
-read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)
-full_data = ((row['donor_id'], row) for row in read_cur)
-
-
-# In[ ]:
-
-
-b_data = deduper.fingerprinter(full_data)
-athenautils.write_many(b_data, 
-                       filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))
-
-
-# In[ ]:
-
-
-
-    # select unique pairs to compare
-    q="""
-        SELECT a.donor_id,
-            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],
-                                  ARRAY[ a.city, a.name, a.zip, a.state, a.address])
-                        AS JSON)),
-            b.donor_id,
-            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], 
-                      ARRAY[ b.city, b.name, b.zip, b.state, b.address])
-                  AS JSON))
-        FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west
-             from as_blocking_map as l
-             INNER JOIN as_blocking_map as r
-             using (block_key)
-             where l.donor_id < r.donor_id) ids
-        INNER JOIN as_processed_donors a on ids.east=a.donor_id
-        INNER JOIN as_processed_donors b on ids.west=b.donor_id
-       """
-    read_cur = cursor_execute(q, database=config.DATABASE)
-
-
-# In[ ]:
-
-
-# ## Clustering
-
-print('clustering...')
-clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
-                                  threshold=0.5)
-
-
-# In[ ]:
-
-
-#     athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE)
-    athenautils.drop_external_table("as_entity_map", 
-                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), 
-                                    database=config.DATABASE)
-    
-    print('creating as_entity_map database')
-    q="""
-    CREATE EXTERNAL TABLE as_entity_map     
-        (donor_id INTEGER, canon_id INTEGER, 
-         cluster_score FLOAT)
-    ROW FORMAT DELIMITED
-      FIELDS TERMINATED BY '\t'
-      LINES TERMINATED BY '\n'  
-    LOCATION
-        's3://{}/{}' 
-    TBLPROPERTIES (
-        'classification'='csv', 
-        --'skip.header.line.count'='1',  
-        'serialization.null.format'='')
-    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') 
-    athenautils.athena_start_query(q, database=config.DATABASE) 
-
-    athenautils.write_many(cluster_ids(clustered_dupes),
-                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))
-
-
-# In[ ]:
-
-
-# Print out the number of duplicates found
-print('# duplicate sets')
-
-# ## Payoff
-
-# With all this done, we can now begin to ask interesting questions
-# of the data
-#
-# For example, let's see who the top 10 donors are.
-
-locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers
-
-athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE)
-
-q = """
-    CREATE TABLE as_e_map as 
-    SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id 
-    FROM as_entity_map 
-    RIGHT JOIN as_donors USING(donor_id)        
-    """    
-athenautils.athena_start_query(q, database=config.DATABASE)
-
-q = """
-    SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
-        donation_totals.totals AS totals 
-    FROM as_donors INNER JOIN 
-        (SELECT canon_id, SUM(cast (amount as double)) AS totals 
-        FROM as_contributions INNER JOIN as_e_map 
-        USING (donor_id) 
-        GROUP BY (canon_id) 
-        ORDER BY totals 
-        DESC LIMIT 10) 
-        AS donation_totals 
-    ON as_donors.donor_id = donation_totals.canon_id
-    ORDER BY totals DESC
-"""
-cur = dict_cursor_execute(q, database=config.DATABASE)
-
-print("Top Donors (deduped)")
-for row in cur:
-    row['totals'] = locale.currency(row['totals'], grouping=True)
-    print('%(totals)20s: %(name)s' % row)
-
-# Compare this to what we would have gotten if we hadn't done any
-# deduplication
-q = """
-    with donorscontributions as(
-
-        SELECT as_donors.donor_id, 
-            array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,
-            cast(as_contributions.amount as double) as amount
-        FROM as_donors INNER JOIN as_contributions 
-            USING (donor_id) 
-        )
-    SELECT name, sum(amount) AS totals  
-    FROM donorscontributions
-    GROUP BY donor_id, name
-    ORDER BY totals DESC 
-    LIMIT 10
-"""
-cur = dict_cursor_execute(q, database=config.DATABASE)
-
-print("Top Donors (raw)")
-for row in cur:
-    row['totals'] = locale.currency(row['totals'], grouping=True)
-    print('%(totals)20s: %(name)s' % row)
-
-print('ran in', time.time() - start_time, 'seconds')
-
-
-# In[ ]:
-
-
-get_ipython().system('jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/')
-
-
-# In[ ]:
-
-
-
-
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
index e452939c..69edb207 100644
--- a/notebooks/athena_example.ipynb
+++ b/notebooks/athena_example.ipynb
@@ -2,10 +2,20 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ../athena_example/athena_example.py\n"
+     ]
+    }
+   ],
    "source": [
+    "%%writefile ../athena_example/athena_example.py\n",
+    "\n",
     "\"\"\"\n",
     "This is an example of working with very large data. There are about\n",
     "700,000 unduplicated donors in this database of Illinois political\n",
@@ -112,15 +122,8 @@
     "    settings_file = 'mysql_example_settings'\n",
     "    training_file = 'mysql_example_training.json'\n",
     "\n",
-    "    start_time = time.time()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    start_time = time.time()\n",
+    "\n",
     "    # We'll be using variations on this following select statement to pull\n",
     "    # in campaign donor info.\n",
     "    #\n",
@@ -153,7 +156,7 @@
     "        deduper = dedupe.Dedupe(fields, num_cores=4)\n",
     "\n",
     "        # We will sample pairs from the entire donor table for training\n",
-    "        cur = cur_execute(DONOR_SELECT)\n",
+    "        cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
     "        temp_d = {i: row for i, row in enumerate(cur)}\n",
     "            \n",
     "\n",
@@ -197,15 +200,8 @@
     "\n",
     "        # We can now remove some of the memory hobbing objects we used\n",
     "        # for training\n",
-    "        deduper.cleanup_training()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "        deduper.cleanup_training()\n",
+    "\n",
     "    # ## Blocking\n",
     "\n",
     "    print('blocking...')\n",
@@ -230,15 +226,8 @@
     "        --'skip.header.line.count'='1',  \n",
     "        'serialization.null.format'='')\n",
     "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n",
-    "    athenautils.athena_start_query(q, database=config.DATABASE)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "\n",
     "    # If dedupe learned a Index Predicate, we have to take a pass\n",
     "    # through the data and create indices.\n",
     "    print('creating inverted index')\n",
@@ -250,43 +239,22 @@
     "        SELECT DISTINCT {field} FROM as_processed_donors\n",
     "        WHERE {field} IS NOT NULL\n",
     "        \"\"\".format(field=field)\n",
-    "        cur = cur_execute(q)\n",
+    "        cur = dict_cursor_execute(q, databse=config.DATABASE)\n",
     "        field_data = (row[field] for row in cur)\n",
     "        deduper.fingerprinter.index(field_data, field)\n",
-    "     "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "     \n",
+    "\n",
     "    # Now we are ready to write our blocking map table by creating a\n",
     "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
     "    print('writing blocking map')\n",
     "    \n",
     "    read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
-    "    full_data = ((row['donor_id'], row) for row in read_cur)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    full_data = ((row['donor_id'], row) for row in read_cur)\n",
+    "\n",
     "    b_data = deduper.fingerprinter(full_data)\n",
     "    athenautils.write_many(b_data, \n",
-    "                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n",
+    "\n",
     "\n",
     "    # select unique pairs to compare\n",
     "    q=\"\"\"\n",
@@ -306,28 +274,15 @@
     "        INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
     "        INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
     "       \"\"\"\n",
-    "    read_cur = cursor_execute(q, database=config.DATABASE)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    read_cur = cursor_execute(q, database=config.DATABASE)\n",
+    "\n",
+    "\n",
     "    # ## Clustering\n",
     "\n",
     "    print('clustering...')\n",
     "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n",
-    "                                      threshold=0.5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "                                      threshold=0.5)\n",
+    "\n",
     "#     athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n",
     "    athenautils.drop_external_table(\"as_entity_map\", \n",
     "                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n",
@@ -351,15 +306,8 @@
     "    athenautils.athena_start_query(q, database=config.DATABASE) \n",
     "\n",
     "    athenautils.write_many(cluster_ids(clustered_dupes),\n",
-    "                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n",
+    "\n",
     "    # Print out the number of duplicates found\n",
     "    print('# duplicate sets')\n",
     "\n",
@@ -427,25 +375,42 @@
     "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
     "        print('%(totals)20s: %(name)s' % row)\n",
     "\n",
-    "    print('ran in', time.time() - start_time, 'seconds')"
+    "    print('ran in', time.time() - start_time, 'seconds')\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[NbConvertApp] Converting notebook athena_example.ipynb to script\n",
-      "[NbConvertApp] Writing 12622 bytes to ../athena_example/athena_example.py\n"
+      "^C\r\n",
+      "Traceback (most recent call last):\r\n",
+      "  File \"../athena_example/athena_example.py\", line 156, in <module>\r\n",
+      "    deduper.prepare_training(temp_d)\r\n",
+      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1249, in prepare_training\r\n",
+      "    self._sample(data, sample_size, blocked_proportion, original_length)\r\n",
+      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1287, in _sample\r\n",
+      "    index_include=examples)\r\n",
+      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 418, in __init__\r\n",
+      "    index_include)\r\n",
+      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 246, in __init__\r\n",
+      "    index_data)\r\n",
+      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 128, in __init__\r\n",
+      "    simple_cover = self.coveredPairs(self.blocker, sampled_records)\r\n",
+      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 156, in coveredPairs\r\n",
+      "    for block in pred_cover.values()\r\n",
+      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 157, in <setcomp>\r\n",
+      "    for pair in itertools.combinations(sorted(block), 2)}\r\n",
+      "KeyboardInterrupt\r\n"
      ]
     }
    ],
    "source": [
-    "!jupyter nbconvert --to script athena_example.ipynb --output-dir=../athena_example/\n"
+    "!python ../athena_example/athena_example.py"
    ]
   },
   {

From 7a8caf62200649399c24f2fddeb6cdc16296302f Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-10-91.eu-west-1.compute.internal>
Date: Thu, 5 Nov 2020 13:46:04 +0000
Subject: [PATCH 12/19] final version

---
 athena_example/athena_example.py | 293 ++++++++++++++++++--
 athena_example/utils.py          | 139 ++++++++++
 notebooks/athena_example.ipynb   | 452 -------------------------------
 notebooks/athena_init_db.ipynb   | 277 -------------------
 4 files changed, 412 insertions(+), 749 deletions(-)
 create mode 100644 athena_example/utils.py
 delete mode 100644 notebooks/athena_example.ipynb
 delete mode 100644 notebooks/athena_init_db.ipynb

diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py
index 7cedbfe6..c1c7fb3c 100644
--- a/athena_example/athena_example.py
+++ b/athena_example/athena_example.py
@@ -8,18 +8,18 @@
 we need to make in memory. Instead, we will read the pairs on demand
 from the Athena database.
 
-__Note:__ You will need to run `python mysql_init_db.py`
+__Note:__ You will need to run `python athena_init_db.py`
 before running this script. See the annotates source for
-[mysql_init_db.py](mysql_init_db.html)
+[athena_init_db.py](athena_init_db.html)
 
 For smaller datasets (<10,000), see our
 [csv_example](csv_example.html)
 """
 
 # There is a little bit difference between the result 
-# of this module and the mysql one. The reason is due to
+# of this module and the mysql one. The reason is due to
 # Some special (and mostly erroneous) characters, such as \a .. 
-# Which are dealt with differently by mysql and athena/panda
+# Which are dealt with differently by mysql and athena/panda
 
 import sys
 import os
@@ -75,34 +75,287 @@ def cluster_ids(clustered_dupes):
         for donor_id, score in zip(cluster, scores):
             yield donor_id, cluster_id, score
 
-if True:
-# if __name__ == '__main__':
+
+if __name__ == '__main__':
 
     ## Logging
 
     # Dedupe uses Python logging to show or suppress verbose output. Added
     # for convenience.  To enable verbose output, run `python
-    # examples/mysql_example/mysql_example.py -v`
+    # examples/athena_example/athena_example.py -v`
     
-#     optp = optparse.OptionParser()
-#     optp.add_option('-v', '--verbose', dest='verbose', action='count',
-#                     help='Increase verbosity (specify multiple times for more)'
-#                     )
-#     (opts, args) = optp.parse_args()
+    optp = optparse.OptionParser()
+    optp.add_option('-v', '--verbose', dest='verbose', action='count',
+                    help='Increase verbosity (specify multiple times for more)'
+                    )
+    (opts, args) = optp.parse_args()
     log_level = logging.WARNING
-#     if opts.verbose:
-#         if opts.verbose == 1:
-#             log_level = logging.INFO
-#         elif opts.verbose >= 2:
-#             log_level = logging.DEBUG
+    if opts.verbose:
+        if opts.verbose == 1:
+            log_level = logging.INFO
+        elif opts.verbose >= 2:
+            log_level = logging.DEBUG
 
 
-#     logging.getLogger().setLevel(log_level)
+    logging.getLogger().setLevel(log_level)
 
     
 
 
-    settings_file = 'mysql_example_settings'
-    training_file = 'mysql_example_training.json'
+    settings_file = 'athena_example_settings'
+    training_file = 'athena_example_training.json'
 
     start_time = time.time()
+
+    # We'll be using variations on this following select statement to pull
+    # in campaign donor info.
+    #
+    # We did a fair amount of preprocessing of the fields in
+    # `athena_init_db.py`    
+    DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address
+                      from as_processed_donors"""
+
+    # ## Training
+
+    if os.path.exists(settings_file):
+        print('reading from ', settings_file)
+        with open(settings_file, 'rb') as sf:
+            deduper = dedupe.StaticDedupe(sf, num_cores=4)
+    else:
+        # Define the fields dedupe will pay attention to
+        #
+        # The address, city, and zip fields are often missing, so we'll
+        # tell dedupe that, and we'll learn a model that take that into
+        # account
+        fields = [{'field': 'name', 'type': 'String'},
+                  {'field': 'address', 'type': 'String',
+                   'has missing': True},
+                  {'field': 'city', 'type': 'ShortString', 'has missing': True},
+                  {'field': 'state', 'type': 'ShortString', 'has missing': True},
+                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},
+                  ]
+
+        # Create a new deduper object and pass our data model to it.
+        deduper = dedupe.Dedupe(fields, num_cores=4)
+
+        # We will sample pairs from the entire donor table for training
+        cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)
+        temp_d = {i: row for i, row in enumerate(cur)}
+            
+
+        # If we have training data saved from a previous run of dedupe,
+        # look for it an load it in.
+        #
+        # __Note:__ if you want to train from
+        # scratch, delete the training_file
+        if os.path.exists(training_file):
+            print('reading labeled examples from ', training_file)
+            with open(training_file) as tf:
+                deduper.prepare_training(temp_d, training_file=tf)
+        else:
+            deduper.prepare_training(temp_d)
+
+        del temp_d
+
+        # ## Active learning
+
+        print('starting active labeling...')
+        # Starts the training loop. Dedupe will find the next pair of records
+        # it is least certain about and ask you to label them as duplicates
+        # or not.
+
+        # use 'y', 'n' and 'u' keys to flag duplicates
+        # press 'f' when you are finished
+        dedupe.convenience.console_label(deduper)
+        # When finished, save our labeled, training pairs to disk
+        with open(training_file, 'w') as tf:
+            deduper.write_training(tf)
+
+        # Notice the argument here
+        #
+        # `recall` is the proportion of true dupes pairs that the learned
+        # rules must cover. You may want to reduce this if your are making
+        # too many blocks and too many comparisons.
+        deduper.train(recall=0.90)
+
+        with open(settings_file, 'wb') as sf:
+            deduper.write_settings(sf)
+
+        # We can now remove some of the memory-hogging objects we used
+        # for training
+        deduper.cleanup_training()
+
+    # ## Blocking
+
+    print('blocking...')
+
+    # To run blocking on such a large set of data, we create a separate table
+    # that contains blocking keys and record ids
+    print('creating as_blocking_map database')
+    athenautils.drop_external_table("as_blocking_map", 
+                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),
+                                    database=config.DATABASE)
+
+    q="""
+    CREATE EXTERNAL TABLE as_blocking_map     
+        (block_key VARCHAR(200), donor_id INTEGER)
+    ROW FORMAT DELIMITED
+      FIELDS TERMINATED BY '\t'
+      LINES TERMINATED BY '\n'  
+    LOCATION
+        's3://{}/{}' 
+    TBLPROPERTIES (
+        'classification'='csv', 
+        --'skip.header.line.count'='1',  
+        'serialization.null.format'='')
+    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') 
+    athenautils.athena_start_query(q, database=config.DATABASE)
+
+    # If dedupe learned a Index Predicate, we have to take a pass
+    # through the data and create indices.
+    print('creating inverted index')
+
+    # Armin: 
+    # This never runs, index_fields is empty, possible bug?
+    for field in deduper.fingerprinter.index_fields:
+        q = """
+        SELECT DISTINCT {field} FROM as_processed_donors
+        WHERE {field} IS NOT NULL
+        """.format(field=field)
+        cur = dict_cursor_execute(q, database=config.DATABASE)
+        # Feed the distinct non-null values of this field to the fingerprinter's index.
+        deduper.fingerprinter.index((row[field] for row in cur), field)
+     
+
+    # Now we are ready to write our blocking map table by creating a
+    # generator that yields unique `(block_key, donor_id)` tuples.
+    print('writing blocking map')
+    
+    read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)
+    full_data = ((row['donor_id'], row) for row in read_cur)
+
+    b_data = deduper.fingerprinter(full_data)
+    athenautils.write_many(b_data, 
+                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))
+
+
+    # select unique pairs to compare
+    q="""
+        SELECT a.donor_id,
+            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],
+                                  ARRAY[ a.city, a.name, a.zip, a.state, a.address])
+                        AS JSON)),
+            b.donor_id,
+            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], 
+                      ARRAY[ b.city, b.name, b.zip, b.state, b.address])
+                  AS JSON))
+        FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west
+             from as_blocking_map as l
+             INNER JOIN as_blocking_map as r
+             using (block_key)
+             where l.donor_id < r.donor_id) ids
+        INNER JOIN as_processed_donors a on ids.east=a.donor_id
+        INNER JOIN as_processed_donors b on ids.west=b.donor_id
+       """
+    read_cur = cursor_execute(q, database=config.DATABASE)
+
+
+    # ## Clustering
+
+    print('clustering...')
+    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
+                                      threshold=0.5)
+
+#     athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE)
+    athenautils.drop_external_table("as_entity_map", 
+                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), 
+                                    database=config.DATABASE)
+    
+    print('creating as_entity_map database')
+    q="""
+    CREATE EXTERNAL TABLE as_entity_map     
+        (donor_id INTEGER, canon_id INTEGER, 
+         cluster_score FLOAT)
+    ROW FORMAT DELIMITED
+      FIELDS TERMINATED BY '\t'
+      LINES TERMINATED BY '\n'  
+    LOCATION
+        's3://{}/{}' 
+    TBLPROPERTIES (
+        'classification'='csv', 
+        --'skip.header.line.count'='1',  
+        'serialization.null.format'='')
+    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') 
+    athenautils.athena_start_query(q, database=config.DATABASE) 
+
+    athenautils.write_many(cluster_ids(clustered_dupes),
+                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))
+
+    # Print out the number of duplicates found
+    print('# duplicate sets')
+
+    # ## Payoff
+
+    # With all this done, we can now begin to ask interesting questions
+    # of the data
+    #
+    # For example, let's see who the top 10 donors are.
+
+    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers
+    
+    athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE)
+    
+    q = """
+        CREATE TABLE as_e_map as 
+        SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id 
+        FROM as_entity_map 
+        RIGHT JOIN as_donors USING(donor_id)        
+        """    
+    athenautils.athena_start_query(q, database=config.DATABASE)
+    
+    q = """
+        SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
+            donation_totals.totals AS totals 
+        FROM as_donors INNER JOIN 
+            (SELECT canon_id, SUM(cast (amount as double)) AS totals 
+            FROM as_contributions INNER JOIN as_e_map 
+            USING (donor_id) 
+            GROUP BY (canon_id) 
+            ORDER BY totals 
+            DESC LIMIT 10) 
+            AS donation_totals 
+        ON as_donors.donor_id = donation_totals.canon_id
+        ORDER BY totals DESC
+    """
+    cur = dict_cursor_execute(q, database=config.DATABASE)
+
+    print("Top Donors (deduped)")
+    for row in cur:
+        row['totals'] = locale.currency(row['totals'], grouping=True)
+        print('%(totals)20s: %(name)s' % row)
+
+    # Compare this to what we would have gotten if we hadn't done any
+    # deduplication
+    q = """
+        with donorscontributions as(
+
+            SELECT as_donors.donor_id, 
+                array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,
+                cast(as_contributions.amount as double) as amount
+            FROM as_donors INNER JOIN as_contributions 
+                USING (donor_id) 
+            )
+        SELECT name, sum(amount) AS totals  
+        FROM donorscontributions
+        GROUP BY donor_id, name
+        ORDER BY totals DESC 
+        LIMIT 10
+    """
+    cur = dict_cursor_execute(q, database=config.DATABASE)
+
+    print("Top Donors (raw)")
+    for row in cur:
+        row['totals'] = locale.currency(row['totals'], grouping=True)
+        print('%(totals)20s: %(name)s' % row)
+
+    print('ran in', time.time() - start_time, 'seconds')
diff --git a/athena_example/utils.py b/athena_example/utils.py
new file mode 100644
index 00000000..1b8b935a
--- /dev/null
+++ b/athena_example/utils.py
@@ -0,0 +1,139 @@
+from __future__ import print_function
+import re
+import boto3
+import botocore
+import sys
+import datetime
+import os
+import time
+import pandas as pd
+from six import string_types
+import sys
+pyver = sys.version_info[0]
+
+if pyver<3:
+    from StringIO import StringIO as SomethingIO
+    from urlparse import urlparse
+else:
+    from io import BytesIO as SomethingIO
+    from urllib.parse import urlparse
+    
+sys.path.insert(0, '../athena_example/')
+import config
+
+s3 = boto3.client('s3', region_name=config.REGION, 
+                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
+  
+athena = boto3.client('athena', region_name=config.REGION, 
+                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
+
+def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs):
+    query_execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True)
+    df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs)
+    return df
+
+
+def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True):
+    """Submit `query` to Athena and return its QueryExecutionId.
+
+    When `wait_until_finished` is true, poll (sleeping 1s, 2s, 3s, ...) until the
+    query leaves QUEUED/RUNNING, and raise Exception if it did not succeed.
+    `region` is accepted for interface compatibility but is not used here; the
+    module-level `athena` client is created with config.REGION.
+    """
+    query_execution_id = athena.start_query_execution(
+        QueryString=query,
+        QueryExecutionContext={'Database': database},
+        WorkGroup=workgroup,
+        ResultConfiguration={"OutputLocation": output_location},
+    )['QueryExecutionId']
+
+    if not wait_until_finished:
+        return query_execution_id
+
+    seconds_to_wait = 1
+    while True:
+        time.sleep(seconds_to_wait)
+        seconds_to_wait += 1
+        status = athena.get_query_execution(QueryExecutionId=query_execution_id)['QueryExecution']['Status']
+        if status['State'] not in ('QUEUED', 'RUNNING'):
+            break
+
+    if status['State'] != 'SUCCEEDED':
+        # StateChangeReason is an optional field; fall back to the state so the real failure is not masked by a KeyError.
+        raise Exception("Athena query failed: %s" % (status.get('StateChangeReason', status['State']),), query_execution_id)
+
+    return query_execution_id
+
+# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
+# Import it instead, when it's updated.
+def is_s3_url(url):
+    """Check for an s3, s3n, or s3a url"""
+    try:
+        return urlparse(url).scheme in ["s3", "s3n", "s3a"]
+    except Exception:
+        return False
+    
+def seperate_bucket_key(url):
+    m = re.match('s3://([^/]+)/(.*)', url)
+    return m.group(1), m.group(2)
+
+def list_all(path):
+    """Return every object key under an S3 prefix, or the entry names of a local directory."""
+    if is_s3_url(path):
+        bucket, prefix = seperate_bucket_key(path)
+        # Paginate: a single list_objects_v2 call returns at most 1000 keys.
+        pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
+        return [obj['Key'] for page in pages for obj in page.get('Contents', [])]
+    return os.listdir(path)
+    
+
+def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs):
+    bucket, key = seperate_bucket_key(filepath_or_buffer)
+    obj = s3.get_object(Bucket=bucket, Key=key)
+    return pd.read_csv(SomethingIO(obj['Body'].read()),  **kwargs)
+
+def read(filename, verbose=True):
+    log ("Reading {}".format(filename), verbose=verbose)
+    if is_s3_url(filename):
+        bucket, key = seperate_bucket_key(filename)
+        obj=s3.get_object(Bucket=bucket, Key=key)
+        return obj['Body'].read()
+    with open (filename) as f:
+        return f.read()
+
+def write(body, filename):
+    bucket, key = seperate_bucket_key(filename)
+    s3.put_object(Bucket=bucket, Key=key, Body=body)
+    return
+        
+    
+def file_exists(filename):
+    """Return True if the object named by the s3:// URL `filename` exists, else False."""
+    bucket, key = seperate_bucket_key(filename)
+    try:
+        # HEAD fetches metadata only; get_object would open a body stream that is never read.
+        s3.head_object(Bucket=bucket, Key=key)
+    except botocore.exceptions.ClientError as e:
+        # head_object reports a missing key as '404' (get_object used 'NoSuchKey').
+        if e.response['Error']['Code'] in ('404', 'NoSuchKey'):
+            return False
+        raise  # Something else has gone wrong.
+    return True
+    
+    
+def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False):
+    if verbose == False:
+        return
+    if timestamped:
+        outstr = "[%s]\t%s\n" % (str(datetime.datetime.now()) , outstr)
+    else:
+        outstr = "%s\n" % (outstr,)
+
+    with open(logfile_name, "a") as logfile:
+        logfile.write(outstr)
+
+    if not quiet:
+        sys.stdout.write(outstr);
+        sys.stdout.flush()
+# Print iterations progress
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
deleted file mode 100644
index 69edb207..00000000
--- a/notebooks/athena_example.ipynb
+++ /dev/null
@@ -1,452 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ../athena_example/athena_example.py\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%writefile ../athena_example/athena_example.py\n",
-    "\n",
-    "\"\"\"\n",
-    "This is an example of working with very large data. There are about\n",
-    "700,000 unduplicated donors in this database of Illinois political\n",
-    "campaign contributions.\n",
-    "\n",
-    "With such a large set of input data, we cannot store all the comparisons\n",
-    "we need to make in memory. Instead, we will read the pairs on demand\n",
-    "from the Athena database.\n",
-    "\n",
-    "__Note:__ You will need to run `python mysql_init_db.py`\n",
-    "before running this script. See the annotates source for\n",
-    "[mysql_init_db.py](mysql_init_db.html)\n",
-    "\n",
-    "For smaller datasets (<10,000), see our\n",
-    "[csv_example](csv_example.html)\n",
-    "\"\"\"\n",
-    "\n",
-    "# There is a little bit difference between the result \n",
-    "# of this module and the mysql one. The reason is due to\n",
-    "# Some special (and mostly erroneous) characters, such as \\a .. \n",
-    "# Which are dealt with differently by mysql and athena/panda\n",
-    "\n",
-    "import sys\n",
-    "import os\n",
-    "import itertools\n",
-    "import time\n",
-    "import logging\n",
-    "import optparse\n",
-    "import locale\n",
-    "import json\n",
-    "from io import StringIO\n",
-    "import csv\n",
-    "import pandas as pd\n",
-    "\n",
-    "import boto3\n",
-    "import dedupe\n",
-    "import dedupe.backport\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "import config\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "import athenautils\n",
-    "\n",
-    "def cursor_execute(query, database):\n",
-    "    '''\n",
-    "    The MySQL compatible Cursor\n",
-    "    '''\n",
-    "    return athenautils.cursor_execute(query, database=database, \n",
-    "                                      cursortype='tuple', buffersize=config.BUFFERSIZE,\n",
-    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
-    "\n",
-    "def dict_cursor_execute(query, database):\n",
-    "    '''\n",
-    "    The MySQL compatible DicCursor\n",
-    "    '''\n",
-    "    return athenautils.cursor_execute(query, database=database, \n",
-    "                                      cursortype='dict', buffersize=config.BUFFERSIZE,\n",
-    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
-    "def record_pairs(result_set):\n",
-    "    for i, row in enumerate(result_set):\n",
-    "        a_record_id, a_record, b_record_id, b_record = row\n",
-    "        record_a = (a_record_id, json.loads(a_record))\n",
-    "        record_b = (b_record_id, json.loads(b_record))\n",
-    "\n",
-    "        yield record_a, record_b\n",
-    "\n",
-    "        if i % 10000 == 0:\n",
-    "            print(i)\n",
-    "\n",
-    "\n",
-    "def cluster_ids(clustered_dupes):\n",
-    "\n",
-    "    for cluster, scores in clustered_dupes:\n",
-    "        cluster_id = cluster[0]\n",
-    "        for donor_id, score in zip(cluster, scores):\n",
-    "            yield donor_id, cluster_id, score\n",
-    "\n",
-    "\n",
-    "if __name__ == '__main__':\n",
-    "\n",
-    "    ## Logging\n",
-    "\n",
-    "    # Dedupe uses Python logging to show or suppress verbose output. Added\n",
-    "    # for convenience.  To enable verbose output, run `python\n",
-    "    # examples/mysql_example/mysql_example.py -v`\n",
-    "    \n",
-    "    optp = optparse.OptionParser()\n",
-    "    optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
-    "                    help='Increase verbosity (specify multiple times for more)'\n",
-    "                    )\n",
-    "    (opts, args) = optp.parse_args()\n",
-    "    log_level = logging.WARNING\n",
-    "    if opts.verbose:\n",
-    "        if opts.verbose == 1:\n",
-    "            log_level = logging.INFO\n",
-    "        elif opts.verbose >= 2:\n",
-    "            log_level = logging.DEBUG\n",
-    "\n",
-    "\n",
-    "    logging.getLogger().setLevel(log_level)\n",
-    "\n",
-    "    \n",
-    "\n",
-    "\n",
-    "    settings_file = 'mysql_example_settings'\n",
-    "    training_file = 'mysql_example_training.json'\n",
-    "\n",
-    "    start_time = time.time()\n",
-    "\n",
-    "    # We'll be using variations on this following select statement to pull\n",
-    "    # in campaign donor info.\n",
-    "    #\n",
-    "    # We did a fair amount of preprocessing of the fields in\n",
-    "    # `mysql_init_db.py`    \n",
-    "    DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n",
-    "                      from as_processed_donors\"\"\"\n",
-    "\n",
-    "    # ## Training\n",
-    "\n",
-    "    if os.path.exists(settings_file):\n",
-    "        print('reading from ', settings_file)\n",
-    "        with open(settings_file, 'rb') as sf:\n",
-    "            deduper = dedupe.StaticDedupe(sf, num_cores=4)\n",
-    "    else:\n",
-    "        # Define the fields dedupe will pay attention to\n",
-    "        #\n",
-    "        # The address, city, and zip fields are often missing, so we'll\n",
-    "        # tell dedupe that, and we'll learn a model that take that into\n",
-    "        # account\n",
-    "        fields = [{'field': 'name', 'type': 'String'},\n",
-    "                  {'field': 'address', 'type': 'String',\n",
-    "                   'has missing': True},\n",
-    "                  {'field': 'city', 'type': 'ShortString', 'has missing': True},\n",
-    "                  {'field': 'state', 'type': 'ShortString', 'has missing': True},\n",
-    "                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n",
-    "                  ]\n",
-    "\n",
-    "        # Create a new deduper object and pass our data model to it.\n",
-    "        deduper = dedupe.Dedupe(fields, num_cores=4)\n",
-    "\n",
-    "        # We will sample pairs from the entire donor table for training\n",
-    "        cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
-    "        temp_d = {i: row for i, row in enumerate(cur)}\n",
-    "            \n",
-    "\n",
-    "        # If we have training data saved from a previous run of dedupe,\n",
-    "        # look for it an load it in.\n",
-    "        #\n",
-    "        # __Note:__ if you want to train from\n",
-    "        # scratch, delete the training_file\n",
-    "        if os.path.exists(training_file):\n",
-    "            print('reading labeled examples from ', training_file)\n",
-    "            with open(training_file) as tf:\n",
-    "                deduper.prepare_training(temp_d, training_file=tf)\n",
-    "        else:\n",
-    "            deduper.prepare_training(temp_d)\n",
-    "\n",
-    "        del temp_d\n",
-    "\n",
-    "        # ## Active learning\n",
-    "\n",
-    "        print('starting active labeling...')\n",
-    "        # Starts the training loop. Dedupe will find the next pair of records\n",
-    "        # it is least certain about and ask you to label them as duplicates\n",
-    "        # or not.\n",
-    "\n",
-    "        # use 'y', 'n' and 'u' keys to flag duplicates\n",
-    "        # press 'f' when you are finished\n",
-    "        dedupe.convenience.console_label(deduper)\n",
-    "        # When finished, save our labeled, training pairs to disk\n",
-    "        with open(training_file, 'w') as tf:\n",
-    "            deduper.write_training(tf)\n",
-    "\n",
-    "        # Notice our the argument here\n",
-    "        #\n",
-    "        # `recall` is the proportion of true dupes pairs that the learned\n",
-    "        # rules must cover. You may want to reduce this if your are making\n",
-    "        # too many blocks and too many comparisons.\n",
-    "        deduper.train(recall=0.90)\n",
-    "\n",
-    "        with open(settings_file, 'wb') as sf:\n",
-    "            deduper.write_settings(sf)\n",
-    "\n",
-    "        # We can now remove some of the memory hobbing objects we used\n",
-    "        # for training\n",
-    "        deduper.cleanup_training()\n",
-    "\n",
-    "    # ## Blocking\n",
-    "\n",
-    "    print('blocking...')\n",
-    "\n",
-    "    # To run blocking on such a large set of data, we create a separate table\n",
-    "    # that contains blocking keys and record ids\n",
-    "    print('creating as_blocking_map database')\n",
-    "    athenautils.drop_external_table(\"as_blocking_map\", \n",
-    "                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n",
-    "                                    database=config.DATABASE)\n",
-    "\n",
-    "    q=\"\"\"\n",
-    "    CREATE EXTERNAL TABLE as_blocking_map     \n",
-    "        (block_key VARCHAR(200), donor_id INTEGER)\n",
-    "    ROW FORMAT DELIMITED\n",
-    "      FIELDS TERMINATED BY '\\t'\n",
-    "      LINES TERMINATED BY '\\n'  \n",
-    "    LOCATION\n",
-    "        's3://{}/{}' \n",
-    "    TBLPROPERTIES (\n",
-    "        'classification'='csv', \n",
-    "        --'skip.header.line.count'='1',  \n",
-    "        'serialization.null.format'='')\n",
-    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n",
-    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "    # If dedupe learned a Index Predicate, we have to take a pass\n",
-    "    # through the data and create indices.\n",
-    "    print('creating inverted index')\n",
-    "\n",
-    "    # Armin: \n",
-    "    # This never runs, index_fields is empty, possible bug?\n",
-    "    for field in deduper.fingerprinter.index_fields:\n",
-    "        q = \"\"\"\n",
-    "        SELECT DISTINCT {field} FROM as_processed_donors\n",
-    "        WHERE {field} IS NOT NULL\n",
-    "        \"\"\".format(field=field)\n",
-    "        cur = dict_cursor_execute(q, databse=config.DATABASE)\n",
-    "        field_data = (row[field] for row in cur)\n",
-    "        deduper.fingerprinter.index(field_data, field)\n",
-    "     \n",
-    "\n",
-    "    # Now we are ready to write our blocking map table by creating a\n",
-    "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
-    "    print('writing blocking map')\n",
-    "    \n",
-    "    read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
-    "    full_data = ((row['donor_id'], row) for row in read_cur)\n",
-    "\n",
-    "    b_data = deduper.fingerprinter(full_data)\n",
-    "    athenautils.write_many(b_data, \n",
-    "                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n",
-    "\n",
-    "\n",
-    "    # select unique pairs to compare\n",
-    "    q=\"\"\"\n",
-    "        SELECT a.donor_id,\n",
-    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n",
-    "                                  ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n",
-    "                        AS JSON)),\n",
-    "            b.donor_id,\n",
-    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n",
-    "                      ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n",
-    "                  AS JSON))\n",
-    "        FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
-    "             from as_blocking_map as l\n",
-    "             INNER JOIN as_blocking_map as r\n",
-    "             using (block_key)\n",
-    "             where l.donor_id < r.donor_id) ids\n",
-    "        INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
-    "        INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
-    "       \"\"\"\n",
-    "    read_cur = cursor_execute(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "    # ## Clustering\n",
-    "\n",
-    "    print('clustering...')\n",
-    "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n",
-    "                                      threshold=0.5)\n",
-    "\n",
-    "#     athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n",
-    "    athenautils.drop_external_table(\"as_entity_map\", \n",
-    "                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n",
-    "                                    database=config.DATABASE)\n",
-    "    \n",
-    "    print('creating as_entity_map database')\n",
-    "    q=\"\"\"\n",
-    "    CREATE EXTERNAL TABLE as_entity_map     \n",
-    "        (donor_id INTEGER, canon_id INTEGER, \n",
-    "         cluster_score FLOAT)\n",
-    "    ROW FORMAT DELIMITED\n",
-    "      FIELDS TERMINATED BY '\\t'\n",
-    "      LINES TERMINATED BY '\\n'  \n",
-    "    LOCATION\n",
-    "        's3://{}/{}' \n",
-    "    TBLPROPERTIES (\n",
-    "        'classification'='csv', \n",
-    "        --'skip.header.line.count'='1',  \n",
-    "        'serialization.null.format'='')\n",
-    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n",
-    "    athenautils.athena_start_query(q, database=config.DATABASE) \n",
-    "\n",
-    "    athenautils.write_many(cluster_ids(clustered_dupes),\n",
-    "                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n",
-    "\n",
-    "    # Print out the number of duplicates found\n",
-    "    print('# duplicate sets')\n",
-    "\n",
-    "    # ## Payoff\n",
-    "\n",
-    "    # With all this done, we can now begin to ask interesting questions\n",
-    "    # of the data\n",
-    "    #\n",
-    "    # For example, let's see who the top 10 donors are.\n",
-    "\n",
-    "    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers\n",
-    "    \n",
-    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n",
-    "    \n",
-    "    q = \"\"\"\n",
-    "        CREATE TABLE as_e_map as \n",
-    "        SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n",
-    "        FROM as_entity_map \n",
-    "        RIGHT JOIN as_donors USING(donor_id)        \n",
-    "        \"\"\"    \n",
-    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "    \n",
-    "    q = \"\"\"\n",
-    "        SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
-    "            donation_totals.totals AS totals \n",
-    "        FROM as_donors INNER JOIN \n",
-    "            (SELECT canon_id, SUM(cast (amount as double)) AS totals \n",
-    "            FROM as_contributions INNER JOIN as_e_map \n",
-    "            USING (donor_id) \n",
-    "            GROUP BY (canon_id) \n",
-    "            ORDER BY totals \n",
-    "            DESC LIMIT 10) \n",
-    "            AS donation_totals \n",
-    "        ON as_donors.donor_id = donation_totals.canon_id\n",
-    "        ORDER BY totals DESC\n",
-    "    \"\"\"\n",
-    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
-    "\n",
-    "    print(\"Top Donors (deduped)\")\n",
-    "    for row in cur:\n",
-    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
-    "        print('%(totals)20s: %(name)s' % row)\n",
-    "\n",
-    "    # Compare this to what we would have gotten if we hadn't done any\n",
-    "    # deduplication\n",
-    "    q = \"\"\"\n",
-    "        with donorscontributions as(\n",
-    "\n",
-    "            SELECT as_donors.donor_id, \n",
-    "                array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
-    "                cast(as_contributions.amount as double) as amount\n",
-    "            FROM as_donors INNER JOIN as_contributions \n",
-    "                USING (donor_id) \n",
-    "            )\n",
-    "        SELECT name, sum(amount) AS totals  \n",
-    "        FROM donorscontributions\n",
-    "        GROUP BY donor_id, name\n",
-    "        ORDER BY totals DESC \n",
-    "        LIMIT 10\n",
-    "    \"\"\"\n",
-    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
-    "\n",
-    "    print(\"Top Donors (raw)\")\n",
-    "    for row in cur:\n",
-    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
-    "        print('%(totals)20s: %(name)s' % row)\n",
-    "\n",
-    "    print('ran in', time.time() - start_time, 'seconds')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "^C\r\n",
-      "Traceback (most recent call last):\r\n",
-      "  File \"../athena_example/athena_example.py\", line 156, in <module>\r\n",
-      "    deduper.prepare_training(temp_d)\r\n",
-      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1249, in prepare_training\r\n",
-      "    self._sample(data, sample_size, blocked_proportion, original_length)\r\n",
-      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/api.py\", line 1287, in _sample\r\n",
-      "    index_include=examples)\r\n",
-      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 418, in __init__\r\n",
-      "    index_include)\r\n",
-      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/labeler.py\", line 246, in __init__\r\n",
-      "    index_data)\r\n",
-      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 128, in __init__\r\n",
-      "    simple_cover = self.coveredPairs(self.blocker, sampled_records)\r\n",
-      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 156, in coveredPairs\r\n",
-      "    for block in pred_cover.values()\r\n",
-      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/dedupe/training.py\", line 157, in <setcomp>\r\n",
-      "    for pair in itertools.combinations(sorted(block), 2)}\r\n",
-      "KeyboardInterrupt\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "!python ../athena_example/athena_example.py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "conda_python3",
-   "language": "python",
-   "name": "conda_python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.10"
-  },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "state": {},
-    "version_major": 2,
-    "version_minor": 0
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
deleted file mode 100644
index d35250de..00000000
--- a/notebooks/athena_init_db.ipynb
+++ /dev/null
@@ -1,277 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# !pip install dedupe"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%writefile ../athena_example/config.py\n",
-    "LOG_FILE = 'log.txt'\n",
-    "\n",
-    "# Connection parameters\n",
-    "ACCESS_KEY_ID = None\n",
-    "SECRET_ACCESS_KEY = None\n",
-    "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n",
-    "WORKGROUP = 'RDS'\n",
-    "REGION = 'eu-west-1'\n",
-    "DATABASE = 'ria_tmp'\n",
-    "\n",
-    "# Database Parameters\n",
-    "DATABASE_BUCKET = 'ria-temp'\n",
-    "DATABASE_ROOT_KEY = 'as_dedupe/'\n",
-    "BUFFERSIZE = 100000"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%writefile ../athena_example/athena_init.py\n",
-    "#!/usr/bin/python\n",
-    "\"\"\"\n",
-    "This is a setup script for athena_example.  It downloads a zip file of\n",
-    "Illinois campaign contributions and loads them into a Athena database\n",
-    "named 'contributions'.\n",
-    " \n",
-    "__Note:__ You will need to run this script first before execuing\n",
-    "[athena_example.py](athena_example.py).\n",
-    " \n",
-    "Tables created:\n",
-    "* as_raw_table - raw import of entire CSV file\n",
-    "* donors - all distinct donors based on name and address\n",
-    "* recipients - all distinct campaign contribution recipients\n",
-    "* contributions - contribution amounts tied to donor and recipients tables\n",
-    "\"\"\"\n",
-    "\n",
-    "import os\n",
-    "import zipfile\n",
-    "import warnings\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from urllib.request import urlopen\n",
-    "import boto3\n",
-    "import config\n",
-    "import csv\n",
-    "import sys\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "import athenautils\n",
-    "\n",
-    "\n",
-    "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
-    "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n",
-    "\n",
-    "if not os.path.exists(contributions_zip_file) :\n",
-    "    print('downloading', contributions_zip_file, '(~60mb) ...')\n",
-    "    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n",
-    "    localFile = open(contributions_zip_file, 'wb')\n",
-    "    localFile.write(u.read())\n",
-    "    localFile.close()\n",
-    "\n",
-    "if not os.path.exists(contributions_txt_file) :\n",
-    "    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n",
-    "    print('extracting %s' % contributions_zip_file)\n",
-    "    zip_file_contents = zip_file.namelist()\n",
-    "    for f in zip_file_contents:\n",
-    "        if ('.txt' in f):\n",
-    "            zip_file.extract(f)\n",
-    "    zip_file.close()\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "print('importing raw data from csv...')\n",
-    "athenautils.drop_external_table(\"as_raw_table\", \n",
-    "                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n",
-    "                                database=config.DATABASE)    \n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "q=r\"\"\"\n",
-    "CREATE EXTERNAL TABLE as_raw_table \n",
-    "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
-    "    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n",
-    "    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n",
-    "    date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n",
-    "    amount VARCHAR(23), receipt_type VARCHAR(23), \n",
-    "    employer VARCHAR(70), occupation VARCHAR(40), \n",
-    "    vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n",
-    "    vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n",
-    "    vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n",
-    "    vendor_zip VARCHAR(10), description VARCHAR(90), \n",
-    "    election_type VARCHAR(10), election_year VARCHAR(10), \n",
-    "    report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n",
-    "    committee_name VARCHAR(70), committee_id VARCHAR(37)) \n",
-    "ROW FORMAT DELIMITED\n",
-    "  FIELDS TERMINATED BY '\\t'\n",
-    "  ESCAPED BY '\\\\'\n",
-    "  LINES TERMINATED BY '\\n'  \n",
-    "LOCATION\n",
-    "    's3://{}/{}' \n",
-    "TBLPROPERTIES (\n",
-    "    'classification'='csv', \n",
-    "    'skip.header.line.count'='1',  \n",
-    "    'serialization.null.format'='')\n",
-    "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE,  \n",
-    "                        error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],\n",
-    "                        chunksize=config.BUFFERSIZE)\n",
-    "chunkcount = 0\n",
-    "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n",
-    "for df in df_cursor: \n",
-    "    # Remove the very few records that mess up the demo \n",
-    "    # (demo purposes only! Don't do something like this in production)\n",
-    "    df = df[df['RcvDate'].str.len()>=10]\n",
-    "\n",
-    "    # set empty, non-zero, strings in date columns to null\n",
-    "    df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n",
-    "\n",
-    "    df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
-    "\n",
-    "    #committee ID is requred. Remove the 2 rows that don't have it.\n",
-    "    df = df[df['ID']!='']\n",
-    "\n",
-    "    # There's a record with a date stuck in the committee_id column, which causes\n",
-    "    # problems when inserting into the contributions table below. Get rid of it this \n",
-    "    # way.\n",
-    "    df = df[df['ID'].str.len() <=9]\n",
-    "\n",
-    "    # dropping the last columns\n",
-    "    df = df.drop(columns='Unnamed: 29')\n",
-    "\n",
-    "    df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
-    "    \n",
-    "    buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n",
-    "    \n",
-    "    chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n",
-    "    athenautils.write(body=buffer, filename=chunk_fname)\n",
-    "    chunkcount += 1    \n",
-    "    \n",
-    "print('creating donors table...')\n",
-    "q=\"\"\"\n",
-    "CREATE TABLE as_donors as\n",
-    "    with tmp as\n",
-    "      (SELECT DISTINCT \n",
-    "           NULLIF(TRIM(last_name), '') as last_name, \n",
-    "           NULLIF(TRIM(first_name), '') as first_name, \n",
-    "           NULLIF(TRIM(address_1), '') as address_1, \n",
-    "           NULLIF(TRIM(address_2), '') as address_2, \n",
-    "           NULLIF(TRIM(city), '') city, \n",
-    "           NULLIF(TRIM(state), '') as state, \n",
-    "           NULLIF(TRIM(zip), '') as zip, \n",
-    "           NULLIF(TRIM(employer), '') as employer, \n",
-    "           NULLIF(TRIM(occupation), '') as occupation\n",
-    "      FROM as_raw_table)\n",
-    "    SELECT row_number() over () as donor_id, * from tmp\"\"\"\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "q=\"\"\"\n",
-    "CREATE TABLE as_recipients as\n",
-    "    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n",
-    "\"\"\"\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "print('creating contributions table')\n",
-    "\n",
-    "q=\"\"\"\n",
-    "CREATE TABLE as_contributions as\n",
-    "    SELECT reciept_id as contribution_id, \n",
-    "        donors.donor_id as donor_id , \n",
-    "        committee_id as recipient_id, \n",
-    "        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n",
-    "        loan_amount, amount, \n",
-    "        receipt_type, vendor_last_name , \n",
-    "        vendor_first_name, vendor_address_1, vendor_address_2, \n",
-    "        vendor_city, vendor_state, vendor_zip, description, \n",
-    "        election_type, election_year, \n",
-    "        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n",
-    "        date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n",
-    "    FROM as_raw_table JOIN as_donors donors ON \n",
-    "        coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND \n",
-    "        coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n",
-    "        coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n",
-    "        coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n",
-    "        coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n",
-    "        coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n",
-    "        coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n",
-    "        coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n",
-    "        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n",
-    "\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "q = \"\"\"\n",
-    "CREATE TABLE as_processed_donors AS  \n",
-    "    SELECT donor_id,  \n",
-    "     LOWER(city) AS city,  \n",
-    "     CASE WHEN (first_name IS NULL AND last_name IS NULL) \n",
-    "          THEN NULL \n",
-    "          ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n",
-    "     END AS name,  \n",
-    "     LOWER(zip) AS zip,  \n",
-    "     LOWER(state) AS state,  \n",
-    "     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n",
-    "          THEN NULL \n",
-    "          ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n",
-    "     END AS address,  \n",
-    "     LOWER(occupation) AS occupation, \n",
-    "     LOWER(employer) AS employer, \n",
-    "     first_name is null AS person \n",
-    " FROM as_donors\"\"\"\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "print('done')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!python ../athena_example/athena_init.py"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "conda_python3",
-   "language": "python",
-   "name": "conda_python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From 3e1bb9657134c12bee1e41e056c87c2d9c27993d Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-10-91.eu-west-1.compute.internal>
Date: Thu, 5 Nov 2020 14:18:56 +0000
Subject: [PATCH 13/19] solving memory issue

---
 notebooks/athena_example.ipynb | 418 +++++++++++++++++++++++++++++++++
 notebooks/athena_init_db.ipynb | 277 ++++++++++++++++++++++
 2 files changed, 695 insertions(+)
 create mode 100644 notebooks/athena_example.ipynb
 create mode 100644 notebooks/athena_init_db.ipynb

diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
new file mode 100644
index 00000000..089fb9e0
--- /dev/null
+++ b/notebooks/athena_example.ipynb
@@ -0,0 +1,418 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile ../athena_example/athena_example.py\n",
+    "\n",
+    "\"\"\"\n",
+    "This is an example of working with very large data. There are about\n",
+    "700,000 unduplicated donors in this database of Illinois political\n",
+    "campaign contributions.\n",
+    "\n",
+    "With such a large set of input data, we cannot store all the comparisons\n",
+    "we need to make in memory. Instead, we will read the pairs on demand\n",
+    "from the Athena database.\n",
+    "\n",
+    "__Note:__ You will need to run `python athena_init_db.py`\n",
+    "before running this script. See the annotates source for\n",
+    "[athena_init_db.py](athena_init_db.html)\n",
+    "\n",
+    "For smaller datasets (<10,000), see our\n",
+    "[csv_example](csv_example.html)\n",
+    "\"\"\"\n",
+    "\n",
+    "# There is a little bit difference between the result \n",
+    "# of this module and the athena one. The reason is due to\n",
+    "# Some special (and mostly erroneous) characters, such as \\a .. \n",
+    "# Which are dealt with differently by athena and athena/panda\n",
+    "\n",
+    "import sys\n",
+    "import os\n",
+    "import itertools\n",
+    "import time\n",
+    "import logging\n",
+    "import optparse\n",
+    "import locale\n",
+    "import json\n",
+    "from io import StringIO\n",
+    "import csv\n",
+    "import pandas as pd\n",
+    "\n",
+    "import boto3\n",
+    "import dedupe\n",
+    "import dedupe.backport\n",
+    "sys.path.insert(0, '../athena_example/')\n",
+    "import config\n",
+    "sys.path.insert(0, '../athena_example/')\n",
+    "import athenautils\n",
+    "\n",
+    "def cursor_execute(query, database):\n",
+    "    '''\n",
+    "    The MySQL compatible Cursor\n",
+    "    '''\n",
+    "    return athenautils.cursor_execute(query, database=database, \n",
+    "                                      cursortype='tuple', buffersize=config.BUFFERSIZE,\n",
+    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
+    "\n",
+    "def dict_cursor_execute(query, database):\n",
+    "    '''\n",
+    "    The MySQL compatible DicCursor\n",
+    "    '''\n",
+    "    return athenautils.cursor_execute(query, database=database, \n",
+    "                                      cursortype='dict', buffersize=config.BUFFERSIZE,\n",
+    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
+    "def record_pairs(result_set):\n",
+    "    for i, row in enumerate(result_set):\n",
+    "        a_record_id, a_record, b_record_id, b_record = row\n",
+    "        record_a = (a_record_id, json.loads(a_record))\n",
+    "        record_b = (b_record_id, json.loads(b_record))\n",
+    "\n",
+    "        yield record_a, record_b\n",
+    "\n",
+    "        if i % 10000 == 0:\n",
+    "            print(i)\n",
+    "\n",
+    "\n",
+    "def cluster_ids(clustered_dupes):\n",
+    "\n",
+    "    for cluster, scores in clustered_dupes:\n",
+    "        cluster_id = cluster[0]\n",
+    "        for donor_id, score in zip(cluster, scores):\n",
+    "            yield donor_id, cluster_id, score\n",
+    "\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "\n",
+    "    ## Logging\n",
+    "\n",
+    "    # Dedupe uses Python logging to show or suppress verbose output. Added\n",
+    "    # for convenience.  To enable verbose output, run `python\n",
+    "    # examples/athena_example/athena_example.py -v`\n",
+    "    \n",
+    "    optp = optparse.OptionParser()\n",
+    "    optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
+    "                    help='Increase verbosity (specify multiple times for more)'\n",
+    "                    )\n",
+    "    (opts, args) = optp.parse_args()\n",
+    "    log_level = logging.WARNING\n",
+    "    if opts.verbose:\n",
+    "        if opts.verbose == 1:\n",
+    "            log_level = logging.INFO\n",
+    "        elif opts.verbose >= 2:\n",
+    "            log_level = logging.DEBUG\n",
+    "\n",
+    "\n",
+    "    logging.getLogger().setLevel(log_level)\n",
+    "\n",
+    "    \n",
+    "\n",
+    "\n",
+    "    settings_file = 'athena_example_settings'\n",
+    "    training_file = 'athena_example_training.json'\n",
+    "\n",
+    "    start_time = time.time()\n",
+    "\n",
+    "    # We'll be using variations on this following select statement to pull\n",
+    "    # in campaign donor info.\n",
+    "    #\n",
+    "    # We did a fair amount of preprocessing of the fields in\n",
+    "    # `athena_init_db.py`    \n",
+    "    DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n",
+    "                      from as_processed_donors\"\"\"\n",
+    "\n",
+    "    # ## Training\n",
+    "\n",
+    "    if os.path.exists(settings_file):\n",
+    "        print('reading from ', settings_file)\n",
+    "        with open(settings_file, 'rb') as sf:\n",
+    "            deduper = dedupe.StaticDedupe(sf, num_cores=4)\n",
+    "    else:\n",
+    "        # Define the fields dedupe will pay attention to\n",
+    "        #\n",
+    "        # The address, city, and zip fields are often missing, so we'll\n",
+    "        # tell dedupe that, and we'll learn a model that take that into\n",
+    "        # account\n",
+    "        fields = [{'field': 'name', 'type': 'String'},\n",
+    "                  {'field': 'address', 'type': 'String',\n",
+    "                   'has missing': True},\n",
+    "                  {'field': 'city', 'type': 'ShortString', 'has missing': True},\n",
+    "                  {'field': 'state', 'type': 'ShortString', 'has missing': True},\n",
+    "                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n",
+    "                  ]\n",
+    "\n",
+    "        # Create a new deduper object and pass our data model to it.\n",
+    "        deduper = dedupe.Dedupe(fields, num_cores=4)\n",
+    "\n",
+    "        # We will sample pairs from the entire donor table for training\n",
+    "        cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
+    "        temp_d = {i: row for i, row in enumerate(cur)}\n",
+    "            \n",
+    "\n",
+    "        # If we have training data saved from a previous run of dedupe,\n",
+    "        # look for it an load it in.\n",
+    "        #\n",
+    "        # __Note:__ if you want to train from\n",
+    "        # scratch, delete the training_file\n",
+    "        if os.path.exists(training_file):\n",
+    "            print('reading labeled examples from ', training_file)\n",
+    "            with open(training_file) as tf:\n",
+    "                deduper.prepare_training(temp_d, training_file=tf)\n",
+    "        else:\n",
+    "            deduper.prepare_training(temp_d)\n",
+    "\n",
+    "        del temp_d\n",
+    "\n",
+    "        # ## Active learning\n",
+    "\n",
+    "        print('starting active labeling...')\n",
+    "        # Starts the training loop. Dedupe will find the next pair of records\n",
+    "        # it is least certain about and ask you to label them as duplicates\n",
+    "        # or not.\n",
+    "\n",
+    "        # use 'y', 'n' and 'u' keys to flag duplicates\n",
+    "        # press 'f' when you are finished\n",
+    "        dedupe.convenience.console_label(deduper)\n",
+    "        # When finished, save our labeled, training pairs to disk\n",
+    "        with open(training_file, 'w') as tf:\n",
+    "            deduper.write_training(tf)\n",
+    "\n",
+    "        # Notice our the argument here\n",
+    "        #\n",
+    "        # `recall` is the proportion of true dupes pairs that the learned\n",
+    "        # rules must cover. You may want to reduce this if your are making\n",
+    "        # too many blocks and too many comparisons.\n",
+    "        deduper.train(recall=0.90)\n",
+    "\n",
+    "        with open(settings_file, 'wb') as sf:\n",
+    "            deduper.write_settings(sf)\n",
+    "\n",
+    "        # We can now remove some of the memory hobbing objects we used\n",
+    "        # for training\n",
+    "        deduper.cleanup_training()\n",
+    "\n",
+    "    # ## Blocking\n",
+    "\n",
+    "    print('blocking...')\n",
+    "\n",
+    "    # To run blocking on such a large set of data, we create a separate table\n",
+    "    # that contains blocking keys and record ids\n",
+    "    print('creating as_blocking_map database')\n",
+    "    athenautils.drop_external_table(\"as_blocking_map\", \n",
+    "                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n",
+    "                                    database=config.DATABASE)\n",
+    "\n",
+    "    q=\"\"\"\n",
+    "    CREATE EXTERNAL TABLE as_blocking_map     \n",
+    "        (block_key VARCHAR(200), donor_id INTEGER)\n",
+    "    ROW FORMAT DELIMITED\n",
+    "      FIELDS TERMINATED BY '\\t'\n",
+    "      LINES TERMINATED BY '\\n'  \n",
+    "    LOCATION\n",
+    "        's3://{}/{}' \n",
+    "    TBLPROPERTIES (\n",
+    "        'classification'='csv', \n",
+    "        --'skip.header.line.count'='1',  \n",
+    "        'serialization.null.format'='')\n",
+    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n",
+    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "\n",
+    "    # If dedupe learned a Index Predicate, we have to take a pass\n",
+    "    # through the data and create indices.\n",
+    "    print('creating inverted index')\n",
+    "\n",
+    "    # Armin: \n",
+    "    # This never runs, index_fields is empty, possible bug?\n",
+    "    for field in deduper.fingerprinter.index_fields:\n",
+    "        q = \"\"\"\n",
+    "        SELECT DISTINCT {field} FROM as_processed_donors\n",
+    "        WHERE {field} IS NOT NULL\n",
+    "        \"\"\".format(field=field)\n",
+    "        cur = dict_cursor_execute(q, databse=config.DATABASE)\n",
+    "        field_data = (row[field] for row in cur)\n",
+    "        deduper.fingerprinter.index(field_data, field)\n",
+    "     \n",
+    "\n",
+    "    # Now we are ready to write our blocking map table by creating a\n",
+    "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
+    "    print('writing blocking map')\n",
+    "    \n",
+    "    read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
+    "    full_data = ((row['donor_id'], row) for row in read_cur)\n",
+    "\n",
+    "    b_data = deduper.fingerprinter(full_data)\n",
+    "    athenautils.write_many(b_data, \n",
+    "                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n",
+    "\n",
+    "\n",
+    "    # select unique pairs to compare\n",
+    "    q=\"\"\"\n",
+    "        SELECT a.donor_id,\n",
+    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n",
+    "                                  ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n",
+    "                        AS JSON)),\n",
+    "            b.donor_id,\n",
+    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n",
+    "                      ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n",
+    "                  AS JSON))\n",
+    "        FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
+    "             from as_blocking_map as l\n",
+    "             INNER JOIN as_blocking_map as r\n",
+    "             using (block_key)\n",
+    "             where l.donor_id < r.donor_id) ids\n",
+    "        INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
+    "        INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
+    "       \"\"\"\n",
+    "    read_cur = cursor_execute(q, database=config.DATABASE)\n",
+    "\n",
+    "\n",
+    "    # ## Clustering\n",
+    "\n",
+    "    print('clustering...')\n",
+    "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n",
+    "                                      threshold=0.5)\n",
+    "\n",
+    "#     athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n",
+    "    athenautils.drop_external_table(\"as_entity_map\", \n",
+    "                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n",
+    "                                    database=config.DATABASE)\n",
+    "    \n",
+    "    print('creating as_entity_map database')\n",
+    "    q=\"\"\"\n",
+    "    CREATE EXTERNAL TABLE as_entity_map     \n",
+    "        (donor_id INTEGER, canon_id INTEGER, \n",
+    "         cluster_score FLOAT)\n",
+    "    ROW FORMAT DELIMITED\n",
+    "      FIELDS TERMINATED BY '\\t'\n",
+    "      LINES TERMINATED BY '\\n'  \n",
+    "    LOCATION\n",
+    "        's3://{}/{}' \n",
+    "    TBLPROPERTIES (\n",
+    "        'classification'='csv', \n",
+    "        --'skip.header.line.count'='1',  \n",
+    "        'serialization.null.format'='')\n",
+    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n",
+    "    athenautils.athena_start_query(q, database=config.DATABASE) \n",
+    "\n",
+    "    athenautils.write_many(cluster_ids(clustered_dupes),\n",
+    "                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n",
+    "\n",
+    "    # Print out the number of duplicates found\n",
+    "    print('# duplicate sets')\n",
+    "\n",
+    "    # ## Payoff\n",
+    "\n",
+    "    # With all this done, we can now begin to ask interesting questions\n",
+    "    # of the data\n",
+    "    #\n",
+    "    # For example, let's see who the top 10 donors are.\n",
+    "\n",
+    "    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers\n",
+    "    \n",
+    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n",
+    "    \n",
+    "    q = \"\"\"\n",
+    "        CREATE TABLE as_e_map as \n",
+    "        SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n",
+    "        FROM as_entity_map \n",
+    "        RIGHT JOIN as_donors USING(donor_id)        \n",
+    "        \"\"\"    \n",
+    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "    \n",
+    "    q = \"\"\"\n",
+    "        SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
+    "            donation_totals.totals AS totals \n",
+    "        FROM as_donors INNER JOIN \n",
+    "            (SELECT canon_id, SUM(cast (amount as double)) AS totals \n",
+    "            FROM as_contributions INNER JOIN as_e_map \n",
+    "            USING (donor_id) \n",
+    "            GROUP BY (canon_id) \n",
+    "            ORDER BY totals \n",
+    "            DESC LIMIT 10) \n",
+    "            AS donation_totals \n",
+    "        ON as_donors.donor_id = donation_totals.canon_id\n",
+    "        ORDER BY totals DESC\n",
+    "    \"\"\"\n",
+    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
+    "\n",
+    "    print(\"Top Donors (deduped)\")\n",
+    "    for row in cur:\n",
+    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
+    "        print('%(totals)20s: %(name)s' % row)\n",
+    "\n",
+    "    # Compare this to what we would have gotten if we hadn't done any\n",
+    "    # deduplication\n",
+    "    q = \"\"\"\n",
+    "        with donorscontributions as(\n",
+    "\n",
+    "            SELECT as_donors.donor_id, \n",
+    "                array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
+    "                cast(as_contributions.amount as double) as amount\n",
+    "            FROM as_donors INNER JOIN as_contributions \n",
+    "                USING (donor_id) \n",
+    "            )\n",
+    "        SELECT name, sum(amount) AS totals  \n",
+    "        FROM donorscontributions\n",
+    "        GROUP BY donor_id, name\n",
+    "        ORDER BY totals DESC \n",
+    "        LIMIT 10\n",
+    "    \"\"\"\n",
+    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
+    "\n",
+    "    print(\"Top Donors (raw)\")\n",
+    "    for row in cur:\n",
+    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
+    "        print('%(totals)20s: %(name)s' % row)\n",
+    "\n",
+    "    print('ran in', time.time() - start_time, 'seconds')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../athena_example/athena_example.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
new file mode 100644
index 00000000..d35250de
--- /dev/null
+++ b/notebooks/athena_init_db.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install dedupe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile ../athena_example/config.py\n",
+    "LOG_FILE = 'log.txt'\n",
+    "\n",
+    "# Connection parameters\n",
+    "ACCESS_KEY_ID = None\n",
+    "SECRET_ACCESS_KEY = None\n",
+    "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n",
+    "WORKGROUP = 'RDS'\n",
+    "REGION = 'eu-west-1'\n",
+    "DATABASE = 'ria_tmp'\n",
+    "\n",
+    "# Database Parameters\n",
+    "DATABASE_BUCKET = 'ria-temp'\n",
+    "DATABASE_ROOT_KEY = 'as_dedupe/'\n",
+    "BUFFERSIZE = 100000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile ../athena_example/athena_init.py\n",
+    "#!/usr/bin/python\n",
+    "\"\"\"\n",
+    "This is a setup script for athena_example.  It downloads a zip file of\n",
+    "Illinois campaign contributions and loads them into a Athena database\n",
+    "named 'contributions'.\n",
+    " \n",
+    "__Note:__ You will need to run this script first before execuing\n",
+    "[athena_example.py](athena_example.py).\n",
+    " \n",
+    "Tables created:\n",
+    "* as_raw_table - raw import of entire CSV file\n",
+    "* donors - all distinct donors based on name and address\n",
+    "* recipients - all distinct campaign contribution recipients\n",
+    "* contributions - contribution amounts tied to donor and recipients tables\n",
+    "\"\"\"\n",
+    "\n",
+    "import os\n",
+    "import zipfile\n",
+    "import warnings\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from urllib.request import urlopen\n",
+    "import boto3\n",
+    "import config\n",
+    "import csv\n",
+    "import sys\n",
+    "sys.path.insert(0, '../athena_example/')\n",
+    "import athenautils\n",
+    "\n",
+    "\n",
+    "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
+    "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n",
+    "\n",
+    "if not os.path.exists(contributions_zip_file) :\n",
+    "    print('downloading', contributions_zip_file, '(~60mb) ...')\n",
+    "    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n",
+    "    localFile = open(contributions_zip_file, 'wb')\n",
+    "    localFile.write(u.read())\n",
+    "    localFile.close()\n",
+    "\n",
+    "if not os.path.exists(contributions_txt_file) :\n",
+    "    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n",
+    "    print('extracting %s' % contributions_zip_file)\n",
+    "    zip_file_contents = zip_file.namelist()\n",
+    "    for f in zip_file_contents:\n",
+    "        if ('.txt' in f):\n",
+    "            zip_file.extract(f)\n",
+    "    zip_file.close()\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "print('importing raw data from csv...')\n",
+    "athenautils.drop_external_table(\"as_raw_table\", \n",
+    "                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n",
+    "                                database=config.DATABASE)    \n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n",
+    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n",
+    "\n",
+    "\n",
+    "q=r\"\"\"\n",
+    "CREATE EXTERNAL TABLE as_raw_table \n",
+    "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
+    "    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n",
+    "    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n",
+    "    date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n",
+    "    amount VARCHAR(23), receipt_type VARCHAR(23), \n",
+    "    employer VARCHAR(70), occupation VARCHAR(40), \n",
+    "    vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n",
+    "    vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n",
+    "    vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n",
+    "    vendor_zip VARCHAR(10), description VARCHAR(90), \n",
+    "    election_type VARCHAR(10), election_year VARCHAR(10), \n",
+    "    report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n",
+    "    committee_name VARCHAR(70), committee_id VARCHAR(37)) \n",
+    "ROW FORMAT DELIMITED\n",
+    "  FIELDS TERMINATED BY '\\t'\n",
+    "  ESCAPED BY '\\\\'\n",
+    "  LINES TERMINATED BY '\\n'  \n",
+    "LOCATION\n",
+    "    's3://{}/{}' \n",
+    "TBLPROPERTIES (\n",
+    "    'classification'='csv', \n",
+    "    'skip.header.line.count'='1',  \n",
+    "    'serialization.null.format'='')\n",
+    "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "\n",
+    "\n",
+    "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE,  \n",
+    "                        error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],\n",
+    "                        chunksize=config.BUFFERSIZE)\n",
+    "chunkcount = 0\n",
+    "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n",
+    "for df in df_cursor: \n",
+    "    # Remove the very few records that mess up the demo \n",
+    "    # (demo purposes only! Don't do something like this in production)\n",
+    "    df = df[df['RcvDate'].str.len()>=10]\n",
+    "\n",
+    "    # set empty, non-zero, strings in date columns to null\n",
+    "    df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n",
+    "\n",
+    "    df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
+    "\n",
+    "    #committee ID is requred. Remove the 2 rows that don't have it.\n",
+    "    df = df[df['ID']!='']\n",
+    "\n",
+    "    # There's a record with a date stuck in the committee_id column, which causes\n",
+    "    # problems when inserting into the contributions table below. Get rid of it this \n",
+    "    # way.\n",
+    "    df = df[df['ID'].str.len() <=9]\n",
+    "\n",
+    "    # dropping the last columns\n",
+    "    df = df.drop(columns='Unnamed: 29')\n",
+    "\n",
+    "    df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
+    "    \n",
+    "    buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n",
+    "    \n",
+    "    chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n",
+    "    athenautils.write(body=buffer, filename=chunk_fname)\n",
+    "    chunkcount += 1    \n",
+    "    \n",
+    "print('creating donors table...')\n",
+    "q=\"\"\"\n",
+    "CREATE TABLE as_donors as\n",
+    "    with tmp as\n",
+    "      (SELECT DISTINCT \n",
+    "           NULLIF(TRIM(last_name), '') as last_name, \n",
+    "           NULLIF(TRIM(first_name), '') as first_name, \n",
+    "           NULLIF(TRIM(address_1), '') as address_1, \n",
+    "           NULLIF(TRIM(address_2), '') as address_2, \n",
+    "           NULLIF(TRIM(city), '') city, \n",
+    "           NULLIF(TRIM(state), '') as state, \n",
+    "           NULLIF(TRIM(zip), '') as zip, \n",
+    "           NULLIF(TRIM(employer), '') as employer, \n",
+    "           NULLIF(TRIM(occupation), '') as occupation\n",
+    "      FROM as_raw_table)\n",
+    "    SELECT row_number() over () as donor_id, * from tmp\"\"\"\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "\n",
+    "\n",
+    "q=\"\"\"\n",
+    "CREATE TABLE as_recipients as\n",
+    "    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n",
+    "\"\"\"\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "\n",
+    "print('creating contributions table')\n",
+    "\n",
+    "q=\"\"\"\n",
+    "CREATE TABLE as_contributions as\n",
+    "    SELECT reciept_id as contribution_id, \n",
+    "        donors.donor_id as donor_id , \n",
+    "        committee_id as recipient_id, \n",
+    "        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n",
+    "        loan_amount, amount, \n",
+    "        receipt_type, vendor_last_name , \n",
+    "        vendor_first_name, vendor_address_1, vendor_address_2, \n",
+    "        vendor_city, vendor_state, vendor_zip, description, \n",
+    "        election_type, election_year, \n",
+    "        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n",
+    "        date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n",
+    "    FROM as_raw_table JOIN as_donors donors ON \n",
+    "        coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND \n",
+    "        coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n",
+    "        coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n",
+    "        coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n",
+    "        coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n",
+    "        coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n",
+    "        coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n",
+    "        coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n",
+    "        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n",
+    "\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "\n",
+    "q = \"\"\"\n",
+    "CREATE TABLE as_processed_donors AS  \n",
+    "    SELECT donor_id,  \n",
+    "     LOWER(city) AS city,  \n",
+    "     CASE WHEN (first_name IS NULL AND last_name IS NULL) \n",
+    "          THEN NULL \n",
+    "          ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n",
+    "     END AS name,  \n",
+    "     LOWER(zip) AS zip,  \n",
+    "     LOWER(state) AS state,  \n",
+    "     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n",
+    "          THEN NULL \n",
+    "          ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n",
+    "     END AS address,  \n",
+    "     LOWER(occupation) AS occupation, \n",
+    "     LOWER(employer) AS employer, \n",
+    "     first_name is null AS person \n",
+    " FROM as_donors\"\"\"\n",
+    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "print('done')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../athena_example/athena_init.py"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From b7eedfcbbe591641d043444ceb4c737281d192ac Mon Sep 17 00:00:00 2001
From: EC2 Default User <ec2-user@ip-10-10-21-220.eu-west-1.compute.internal>
Date: Thu, 11 Feb 2021 23:10:35 +0000
Subject: [PATCH 14/19] modifying pandas_read_csv

---
 athena_example/utils.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/athena_example/utils.py b/athena_example/utils.py
index 1b8b935a..80922548 100644
--- a/athena_example/utils.py
+++ b/athena_example/utils.py
@@ -89,18 +89,17 @@ def list_all(path):
     
 
 def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs):
-    bucket, key = seperate_bucket_key(filepath_or_buffer)
-    obj = s3.get_object(Bucket=bucket, Key=key)
-    return pd.read_csv(SomethingIO(obj['Body'].read()),  **kwargs)
+    return pd.read_csv(filepath_or_buffer, **kwargs)
 
-def read(filename, verbose=True):
+def reader(filename, Range='string', verbose=True):
+    '''
+        Range: look at: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
+    '''
     log ("Reading {}".format(filename), verbose=verbose)
     if is_s3_url(filename):
         bucket, key = seperate_bucket_key(filename)
-        obj=s3.get_object(Bucket=bucket, Key=key)
-        return obj['Body'].read()
-    with open (filename) as f:
-        return f.read()
+        obj=s3.get_object(Bucket=bucket, Key=key, Range=Range)
+        return obj['Body']
 
 def write(body, filename):
     bucket, key = seperate_bucket_key(filename)

From e30e10764b07ea80085fce4310df7f86354060bc Mon Sep 17 00:00:00 2001
From: asajadi <asajadi@gmail.com>
Date: Sat, 8 May 2021 16:53:53 -0400
Subject: [PATCH 15/19] updating config file, removing notebooks

---
 README.md                      |  68 ++++--
 athena_example/README.md       |   5 +-
 athena_example/config.py       |  12 +-
 notebooks/athena_example.ipynb | 418 ---------------------------------
 notebooks/athena_init_db.ipynb | 277 ----------------------
 5 files changed, 61 insertions(+), 719 deletions(-)
 delete mode 100644 notebooks/athena_example.ipynb
 delete mode 100644 notebooks/athena_init_db.ipynb

diff --git a/README.md b/README.md
index 82abad3d..073b0ede 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 # Dedupe Examples
 
-Adding Athena Example scripts for the [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data.
+Example scripts for [dedupe](https://github.com/dedupeio/dedupe), a library that uses machine learning to perform de-duplication and entity resolution quickly on structured data.
 
 Part of the [Dedupe.io](https://dedupe.io/) cloud service and open source toolset for de-duplicating and finding fuzzy matches in your data. For more details, see the [differences between Dedupe.io and the dedupe library](https://dedupe.io/documentation/should-i-use-dedupeio-or-the-dedupe-python-library.html).
 
-To get the athena examples:
+To get these examples:
 ```bash
-git clone https://github.com/asajadi/dedupe-examples.git
+git clone https://github.com/dedupeio/dedupe-examples.git
 cd dedupe-examples
 ```
 
@@ -34,28 +34,66 @@ Afterwards, whenever you want to work on dedupe-examples,
 workon dedupe-examples
 ```
 
+### [CSV example](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html) - early childhood locations
 
-### [athena example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions
+This example works with a list of early childhood education sites in Chicago from 10 different sources.
 
-Takes a database of IL campaign contribution data, loads it in to a
-Athena database, and identifies the unique donors. 
+```bash
+cd csv_example
+pip install unidecode
+python csv_example.py
+```
+  (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished)
+
+**To see how you might use dedupe with smallish data, see the [annotated source code for csv_example.py](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html).**
 
-To follow this example you need to 
+### [Patent example](https://dedupeio.github.io/dedupe-examples/docs/patent_example.html) -  patent holders
 
-* Create a Athena database called 'contributions'
-* Update `athena_example/config.py` with your Athena credentials
-* Install dependencies, `pip install -r requirements.txt`
+This example works with Dutch inventors from the PATSTAT international patent data file
+
+```bash
+cd patent_example
+pip install unidecode
+python patent_example.py
+```
+  (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished)
 
-Once that's all done you can run the example:
+### [Record Linkage example](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html) -  electronics products
+This example links two spreadsheets of electronics products and links up the matching entries. Each dataset individually has no duplicates.
 
 ```bash
-cd mysql_example
-python athena_init_db.py 
-python athena_example.py
+cd record_linkage_example
+python record_linkage_example.py
 ```
 
-  (use 'y', 'n' and 'u' keys to flag duplicates for active learning, 'f' when you are finished) 
+**To see how you might use dedupe for linking datasets, see the [annotated source code for record_linkage_example.py](https://dedupeio.github.io/dedupe-examples/docs/record_linkage_example.html).**
+
+### [Gazetteer example](https://dedupeio.github.io/dedupe-examples/docs/gazetteer_example.html) -  electronics products
+This example links two spreadsheets of electronics products and links up the matching entries using the Gazetteer class
+
+```bash
+cd gazetteer_example
+python gazetteer_example.py
+```
+
+
+### [MySQL example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html) - IL campaign contributions
+
+See `mysql_example/README.md` for details
+
+**To see how you might use dedupe with bigish data, see the [annotated source code for mysql_example](https://dedupeio.github.io/dedupe-examples/docs/mysql_example.html).**
+
+
+### [PostgreSQL big dedupe example](https://dedupeio.github.io/dedupe-examples/docs/pgsql_big_dedupe_example.html) - PostgreSQL example on large dataset
+
+See `pgsql_big_dedupe_example/README.md` for details
+
+This is the same example as the MySQL IL campaign contributions dataset above, but ported to run on PostgreSQL.
+
+### Athena example - IL campaign contributions
+See `athena_example/README.md` for details
 
+This is the same example as the MySQL IL campaign contributions dataset above, but ported to run on Athena.
 
 
 
diff --git a/athena_example/README.md b/athena_example/README.md
index 53442a12..7b481322 100644
--- a/athena_example/README.md
+++ b/athena_example/README.md
@@ -5,14 +5,13 @@ Athena database, and identifies the unique donors.
 
 To follow this example you need to 
 
-* Create a Athena database called 'contributions'
-* Update `athena_example/config.py` with your Athena credentials
+* Update `athena_example/config.py` with your Athena credentials, database name, and the path to store the data
 * Install dependencies, `pip install -r requirements.txt`
 
 Once that's all done you can run the example:
 
 ```bash
-cd mysql_example
+cd athena_example
 python athena_init.py 
 python athena_example.py
 ```
diff --git a/athena_example/config.py b/athena_example/config.py
index 9808a709..b37da68c 100644
--- a/athena_example/config.py
+++ b/athena_example/config.py
@@ -3,12 +3,12 @@
 # Connection parameters
 ACCESS_KEY_ID = None
 SECRET_ACCESS_KEY = None
-ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'
-WORKGROUP = 'RDS'
-REGION = 'eu-west-1'
-DATABASE = 'ria_tmp'
+ATHENA_GARBAGE_PATH = '<ATHENA_OUTPUT_LOCATION>'
+WORKGROUP = '<WORKGROUP>'
+REGION = '<REGION>'
+DATABASE = '<DATABASE>'
 
 # Database Parameters
-DATABASE_BUCKET = 'ria-temp'
-DATABASE_ROOT_KEY = 'as_dedupe/'
+DATABASE_BUCKET = '<DATABASE_BUCKET>'
+DATABASE_ROOT_KEY = 'dedupe/'
 BUFFERSIZE = 100000
diff --git a/notebooks/athena_example.ipynb b/notebooks/athena_example.ipynb
deleted file mode 100644
index 089fb9e0..00000000
--- a/notebooks/athena_example.ipynb
+++ /dev/null
@@ -1,418 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%writefile ../athena_example/athena_example.py\n",
-    "\n",
-    "\"\"\"\n",
-    "This is an example of working with very large data. There are about\n",
-    "700,000 unduplicated donors in this database of Illinois political\n",
-    "campaign contributions.\n",
-    "\n",
-    "With such a large set of input data, we cannot store all the comparisons\n",
-    "we need to make in memory. Instead, we will read the pairs on demand\n",
-    "from the Athena database.\n",
-    "\n",
-    "__Note:__ You will need to run `python athena_init_db.py`\n",
-    "before running this script. See the annotates source for\n",
-    "[athena_init_db.py](athena_init_db.html)\n",
-    "\n",
-    "For smaller datasets (<10,000), see our\n",
-    "[csv_example](csv_example.html)\n",
-    "\"\"\"\n",
-    "\n",
-    "# There is a little bit difference between the result \n",
-    "# of this module and the athena one. The reason is due to\n",
-    "# Some special (and mostly erroneous) characters, such as \\a .. \n",
-    "# Which are dealt with differently by athena and athena/panda\n",
-    "\n",
-    "import sys\n",
-    "import os\n",
-    "import itertools\n",
-    "import time\n",
-    "import logging\n",
-    "import optparse\n",
-    "import locale\n",
-    "import json\n",
-    "from io import StringIO\n",
-    "import csv\n",
-    "import pandas as pd\n",
-    "\n",
-    "import boto3\n",
-    "import dedupe\n",
-    "import dedupe.backport\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "import config\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "import athenautils\n",
-    "\n",
-    "def cursor_execute(query, database):\n",
-    "    '''\n",
-    "    The MySQL compatible Cursor\n",
-    "    '''\n",
-    "    return athenautils.cursor_execute(query, database=database, \n",
-    "                                      cursortype='tuple', buffersize=config.BUFFERSIZE,\n",
-    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
-    "\n",
-    "def dict_cursor_execute(query, database):\n",
-    "    '''\n",
-    "    The MySQL compatible DicCursor\n",
-    "    '''\n",
-    "    return athenautils.cursor_execute(query, database=database, \n",
-    "                                      cursortype='dict', buffersize=config.BUFFERSIZE,\n",
-    "                                      escapechar=None, keep_default_na=False, na_values=[''])\n",
-    "def record_pairs(result_set):\n",
-    "    for i, row in enumerate(result_set):\n",
-    "        a_record_id, a_record, b_record_id, b_record = row\n",
-    "        record_a = (a_record_id, json.loads(a_record))\n",
-    "        record_b = (b_record_id, json.loads(b_record))\n",
-    "\n",
-    "        yield record_a, record_b\n",
-    "\n",
-    "        if i % 10000 == 0:\n",
-    "            print(i)\n",
-    "\n",
-    "\n",
-    "def cluster_ids(clustered_dupes):\n",
-    "\n",
-    "    for cluster, scores in clustered_dupes:\n",
-    "        cluster_id = cluster[0]\n",
-    "        for donor_id, score in zip(cluster, scores):\n",
-    "            yield donor_id, cluster_id, score\n",
-    "\n",
-    "\n",
-    "if __name__ == '__main__':\n",
-    "\n",
-    "    ## Logging\n",
-    "\n",
-    "    # Dedupe uses Python logging to show or suppress verbose output. Added\n",
-    "    # for convenience.  To enable verbose output, run `python\n",
-    "    # examples/athena_example/athena_example.py -v`\n",
-    "    \n",
-    "    optp = optparse.OptionParser()\n",
-    "    optp.add_option('-v', '--verbose', dest='verbose', action='count',\n",
-    "                    help='Increase verbosity (specify multiple times for more)'\n",
-    "                    )\n",
-    "    (opts, args) = optp.parse_args()\n",
-    "    log_level = logging.WARNING\n",
-    "    if opts.verbose:\n",
-    "        if opts.verbose == 1:\n",
-    "            log_level = logging.INFO\n",
-    "        elif opts.verbose >= 2:\n",
-    "            log_level = logging.DEBUG\n",
-    "\n",
-    "\n",
-    "    logging.getLogger().setLevel(log_level)\n",
-    "\n",
-    "    \n",
-    "\n",
-    "\n",
-    "    settings_file = 'athena_example_settings'\n",
-    "    training_file = 'athena_example_training.json'\n",
-    "\n",
-    "    start_time = time.time()\n",
-    "\n",
-    "    # We'll be using variations on this following select statement to pull\n",
-    "    # in campaign donor info.\n",
-    "    #\n",
-    "    # We did a fair amount of preprocessing of the fields in\n",
-    "    # `athena_init_db.py`    \n",
-    "    DONOR_SELECT = \"\"\"SELECT donor_id, city, name, zip, state, address\n",
-    "                      from as_processed_donors\"\"\"\n",
-    "\n",
-    "    # ## Training\n",
-    "\n",
-    "    if os.path.exists(settings_file):\n",
-    "        print('reading from ', settings_file)\n",
-    "        with open(settings_file, 'rb') as sf:\n",
-    "            deduper = dedupe.StaticDedupe(sf, num_cores=4)\n",
-    "    else:\n",
-    "        # Define the fields dedupe will pay attention to\n",
-    "        #\n",
-    "        # The address, city, and zip fields are often missing, so we'll\n",
-    "        # tell dedupe that, and we'll learn a model that take that into\n",
-    "        # account\n",
-    "        fields = [{'field': 'name', 'type': 'String'},\n",
-    "                  {'field': 'address', 'type': 'String',\n",
-    "                   'has missing': True},\n",
-    "                  {'field': 'city', 'type': 'ShortString', 'has missing': True},\n",
-    "                  {'field': 'state', 'type': 'ShortString', 'has missing': True},\n",
-    "                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},\n",
-    "                  ]\n",
-    "\n",
-    "        # Create a new deduper object and pass our data model to it.\n",
-    "        deduper = dedupe.Dedupe(fields, num_cores=4)\n",
-    "\n",
-    "        # We will sample pairs from the entire donor table for training\n",
-    "        cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
-    "        temp_d = {i: row for i, row in enumerate(cur)}\n",
-    "            \n",
-    "\n",
-    "        # If we have training data saved from a previous run of dedupe,\n",
-    "        # look for it an load it in.\n",
-    "        #\n",
-    "        # __Note:__ if you want to train from\n",
-    "        # scratch, delete the training_file\n",
-    "        if os.path.exists(training_file):\n",
-    "            print('reading labeled examples from ', training_file)\n",
-    "            with open(training_file) as tf:\n",
-    "                deduper.prepare_training(temp_d, training_file=tf)\n",
-    "        else:\n",
-    "            deduper.prepare_training(temp_d)\n",
-    "\n",
-    "        del temp_d\n",
-    "\n",
-    "        # ## Active learning\n",
-    "\n",
-    "        print('starting active labeling...')\n",
-    "        # Starts the training loop. Dedupe will find the next pair of records\n",
-    "        # it is least certain about and ask you to label them as duplicates\n",
-    "        # or not.\n",
-    "\n",
-    "        # use 'y', 'n' and 'u' keys to flag duplicates\n",
-    "        # press 'f' when you are finished\n",
-    "        dedupe.convenience.console_label(deduper)\n",
-    "        # When finished, save our labeled, training pairs to disk\n",
-    "        with open(training_file, 'w') as tf:\n",
-    "            deduper.write_training(tf)\n",
-    "\n",
-    "        # Notice our the argument here\n",
-    "        #\n",
-    "        # `recall` is the proportion of true dupes pairs that the learned\n",
-    "        # rules must cover. You may want to reduce this if your are making\n",
-    "        # too many blocks and too many comparisons.\n",
-    "        deduper.train(recall=0.90)\n",
-    "\n",
-    "        with open(settings_file, 'wb') as sf:\n",
-    "            deduper.write_settings(sf)\n",
-    "\n",
-    "        # We can now remove some of the memory hobbing objects we used\n",
-    "        # for training\n",
-    "        deduper.cleanup_training()\n",
-    "\n",
-    "    # ## Blocking\n",
-    "\n",
-    "    print('blocking...')\n",
-    "\n",
-    "    # To run blocking on such a large set of data, we create a separate table\n",
-    "    # that contains blocking keys and record ids\n",
-    "    print('creating as_blocking_map database')\n",
-    "    athenautils.drop_external_table(\"as_blocking_map\", \n",
-    "                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),\n",
-    "                                    database=config.DATABASE)\n",
-    "\n",
-    "    q=\"\"\"\n",
-    "    CREATE EXTERNAL TABLE as_blocking_map     \n",
-    "        (block_key VARCHAR(200), donor_id INTEGER)\n",
-    "    ROW FORMAT DELIMITED\n",
-    "      FIELDS TERMINATED BY '\\t'\n",
-    "      LINES TERMINATED BY '\\n'  \n",
-    "    LOCATION\n",
-    "        's3://{}/{}' \n",
-    "    TBLPROPERTIES (\n",
-    "        'classification'='csv', \n",
-    "        --'skip.header.line.count'='1',  \n",
-    "        'serialization.null.format'='')\n",
-    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') \n",
-    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "    # If dedupe learned a Index Predicate, we have to take a pass\n",
-    "    # through the data and create indices.\n",
-    "    print('creating inverted index')\n",
-    "\n",
-    "    # Armin: \n",
-    "    # This never runs, index_fields is empty, possible bug?\n",
-    "    for field in deduper.fingerprinter.index_fields:\n",
-    "        q = \"\"\"\n",
-    "        SELECT DISTINCT {field} FROM as_processed_donors\n",
-    "        WHERE {field} IS NOT NULL\n",
-    "        \"\"\".format(field=field)\n",
-    "        cur = dict_cursor_execute(q, databse=config.DATABASE)\n",
-    "        field_data = (row[field] for row in cur)\n",
-    "        deduper.fingerprinter.index(field_data, field)\n",
-    "     \n",
-    "\n",
-    "    # Now we are ready to write our blocking map table by creating a\n",
-    "    # generator that yields unique `(block_key, donor_id)` tuples.\n",
-    "    print('writing blocking map')\n",
-    "    \n",
-    "    read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)\n",
-    "    full_data = ((row['donor_id'], row) for row in read_cur)\n",
-    "\n",
-    "    b_data = deduper.fingerprinter(full_data)\n",
-    "    athenautils.write_many(b_data, \n",
-    "                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))\n",
-    "\n",
-    "\n",
-    "    # select unique pairs to compare\n",
-    "    q=\"\"\"\n",
-    "        SELECT a.donor_id,\n",
-    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],\n",
-    "                                  ARRAY[ a.city, a.name, a.zip, a.state, a.address])\n",
-    "                        AS JSON)),\n",
-    "            b.donor_id,\n",
-    "            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], \n",
-    "                      ARRAY[ b.city, b.name, b.zip, b.state, b.address])\n",
-    "                  AS JSON))\n",
-    "        FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west\n",
-    "             from as_blocking_map as l\n",
-    "             INNER JOIN as_blocking_map as r\n",
-    "             using (block_key)\n",
-    "             where l.donor_id < r.donor_id) ids\n",
-    "        INNER JOIN as_processed_donors a on ids.east=a.donor_id\n",
-    "        INNER JOIN as_processed_donors b on ids.west=b.donor_id\n",
-    "       \"\"\"\n",
-    "    read_cur = cursor_execute(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "    # ## Clustering\n",
-    "\n",
-    "    print('clustering...')\n",
-    "    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),\n",
-    "                                      threshold=0.5)\n",
-    "\n",
-    "#     athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_entity_map\", database=config.DATABASE)\n",
-    "    athenautils.drop_external_table(\"as_entity_map\", \n",
-    "                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), \n",
-    "                                    database=config.DATABASE)\n",
-    "    \n",
-    "    print('creating as_entity_map database')\n",
-    "    q=\"\"\"\n",
-    "    CREATE EXTERNAL TABLE as_entity_map     \n",
-    "        (donor_id INTEGER, canon_id INTEGER, \n",
-    "         cluster_score FLOAT)\n",
-    "    ROW FORMAT DELIMITED\n",
-    "      FIELDS TERMINATED BY '\\t'\n",
-    "      LINES TERMINATED BY '\\n'  \n",
-    "    LOCATION\n",
-    "        's3://{}/{}' \n",
-    "    TBLPROPERTIES (\n",
-    "        'classification'='csv', \n",
-    "        --'skip.header.line.count'='1',  \n",
-    "        'serialization.null.format'='')\n",
-    "    \"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') \n",
-    "    athenautils.athena_start_query(q, database=config.DATABASE) \n",
-    "\n",
-    "    athenautils.write_many(cluster_ids(clustered_dupes),\n",
-    "                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))\n",
-    "\n",
-    "    # Print out the number of duplicates found\n",
-    "    print('# duplicate sets')\n",
-    "\n",
-    "    # ## Payoff\n",
-    "\n",
-    "    # With all this done, we can now begin to ask interesting questions\n",
-    "    # of the data\n",
-    "    #\n",
-    "    # For example, let's see who the top 10 donors are.\n",
-    "\n",
-    "    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers\n",
-    "    \n",
-    "    athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_e_map\", database=config.DATABASE)\n",
-    "    \n",
-    "    q = \"\"\"\n",
-    "        CREATE TABLE as_e_map as \n",
-    "        SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id \n",
-    "        FROM as_entity_map \n",
-    "        RIGHT JOIN as_donors USING(donor_id)        \n",
-    "        \"\"\"    \n",
-    "    athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "    \n",
-    "    q = \"\"\"\n",
-    "        SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   \n",
-    "            donation_totals.totals AS totals \n",
-    "        FROM as_donors INNER JOIN \n",
-    "            (SELECT canon_id, SUM(cast (amount as double)) AS totals \n",
-    "            FROM as_contributions INNER JOIN as_e_map \n",
-    "            USING (donor_id) \n",
-    "            GROUP BY (canon_id) \n",
-    "            ORDER BY totals \n",
-    "            DESC LIMIT 10) \n",
-    "            AS donation_totals \n",
-    "        ON as_donors.donor_id = donation_totals.canon_id\n",
-    "        ORDER BY totals DESC\n",
-    "    \"\"\"\n",
-    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
-    "\n",
-    "    print(\"Top Donors (deduped)\")\n",
-    "    for row in cur:\n",
-    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
-    "        print('%(totals)20s: %(name)s' % row)\n",
-    "\n",
-    "    # Compare this to what we would have gotten if we hadn't done any\n",
-    "    # deduplication\n",
-    "    q = \"\"\"\n",
-    "        with donorscontributions as(\n",
-    "\n",
-    "            SELECT as_donors.donor_id, \n",
-    "                array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,\n",
-    "                cast(as_contributions.amount as double) as amount\n",
-    "            FROM as_donors INNER JOIN as_contributions \n",
-    "                USING (donor_id) \n",
-    "            )\n",
-    "        SELECT name, sum(amount) AS totals  \n",
-    "        FROM donorscontributions\n",
-    "        GROUP BY donor_id, name\n",
-    "        ORDER BY totals DESC \n",
-    "        LIMIT 10\n",
-    "    \"\"\"\n",
-    "    cur = dict_cursor_execute(q, database=config.DATABASE)\n",
-    "\n",
-    "    print(\"Top Donors (raw)\")\n",
-    "    for row in cur:\n",
-    "        row['totals'] = locale.currency(row['totals'], grouping=True)\n",
-    "        print('%(totals)20s: %(name)s' % row)\n",
-    "\n",
-    "    print('ran in', time.time() - start_time, 'seconds')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!python ../athena_example/athena_example.py"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "conda_python3",
-   "language": "python",
-   "name": "conda_python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.10"
-  },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "state": {},
-    "version_major": 2,
-    "version_minor": 0
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/athena_init_db.ipynb b/notebooks/athena_init_db.ipynb
deleted file mode 100644
index d35250de..00000000
--- a/notebooks/athena_init_db.ipynb
+++ /dev/null
@@ -1,277 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# !pip install dedupe"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%writefile ../athena_example/config.py\n",
-    "LOG_FILE = 'log.txt'\n",
-    "\n",
-    "# Connection parameters\n",
-    "ACCESS_KEY_ID = None\n",
-    "SECRET_ACCESS_KEY = None\n",
-    "ATHENA_GARBAGE_PATH = 's3://aws-athena-query-results-rds'\n",
-    "WORKGROUP = 'RDS'\n",
-    "REGION = 'eu-west-1'\n",
-    "DATABASE = 'ria_tmp'\n",
-    "\n",
-    "# Database Parameters\n",
-    "DATABASE_BUCKET = 'ria-temp'\n",
-    "DATABASE_ROOT_KEY = 'as_dedupe/'\n",
-    "BUFFERSIZE = 100000"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%writefile ../athena_example/athena_init.py\n",
-    "#!/usr/bin/python\n",
-    "\"\"\"\n",
-    "This is a setup script for athena_example.  It downloads a zip file of\n",
-    "Illinois campaign contributions and loads them into a Athena database\n",
-    "named 'contributions'.\n",
-    " \n",
-    "__Note:__ You will need to run this script first before execuing\n",
-    "[athena_example.py](athena_example.py).\n",
-    " \n",
-    "Tables created:\n",
-    "* as_raw_table - raw import of entire CSV file\n",
-    "* donors - all distinct donors based on name and address\n",
-    "* recipients - all distinct campaign contribution recipients\n",
-    "* contributions - contribution amounts tied to donor and recipients tables\n",
-    "\"\"\"\n",
-    "\n",
-    "import os\n",
-    "import zipfile\n",
-    "import warnings\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from urllib.request import urlopen\n",
-    "import boto3\n",
-    "import config\n",
-    "import csv\n",
-    "import sys\n",
-    "sys.path.insert(0, '../athena_example/')\n",
-    "import athenautils\n",
-    "\n",
-    "\n",
-    "contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'\n",
-    "contributions_txt_file = 'Illinois-campaign-contributions.txt'\n",
-    "\n",
-    "if not os.path.exists(contributions_zip_file) :\n",
-    "    print('downloading', contributions_zip_file, '(~60mb) ...')\n",
-    "    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')\n",
-    "    localFile = open(contributions_zip_file, 'wb')\n",
-    "    localFile.write(u.read())\n",
-    "    localFile.close()\n",
-    "\n",
-    "if not os.path.exists(contributions_txt_file) :\n",
-    "    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')\n",
-    "    print('extracting %s' % contributions_zip_file)\n",
-    "    zip_file_contents = zip_file.namelist()\n",
-    "    for f in zip_file_contents:\n",
-    "        if ('.txt' in f):\n",
-    "            zip_file.extract(f)\n",
-    "    zip_file.close()\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "print('importing raw data from csv...')\n",
-    "athenautils.drop_external_table(\"as_raw_table\", \n",
-    "                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),\n",
-    "                                database=config.DATABASE)    \n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_donors\", database=config.DATABASE)\n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_recipients\", database=config.DATABASE)\n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_contributions\", database=config.DATABASE)\n",
-    "athenautils.athena_start_query(\"DROP TABLE IF EXISTS as_processed_donors\", database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "q=r\"\"\"\n",
-    "CREATE EXTERNAL TABLE as_raw_table \n",
-    "    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), \n",
-    "    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), \n",
-    "    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), \n",
-    "    date_recieved VARCHAR(10), loan_amount VARCHAR(12), \n",
-    "    amount VARCHAR(23), receipt_type VARCHAR(23), \n",
-    "    employer VARCHAR(70), occupation VARCHAR(40), \n",
-    "    vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), \n",
-    "    vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), \n",
-    "    vendor_city VARCHAR(20), vendor_state VARCHAR(10), \n",
-    "    vendor_zip VARCHAR(10), description VARCHAR(90), \n",
-    "    election_type VARCHAR(10), election_year VARCHAR(10), \n",
-    "    report_period_begin VARCHAR(10), report_period_end VARCHAR(33), \n",
-    "    committee_name VARCHAR(70), committee_id VARCHAR(37)) \n",
-    "ROW FORMAT DELIMITED\n",
-    "  FIELDS TERMINATED BY '\\t'\n",
-    "  ESCAPED BY '\\\\'\n",
-    "  LINES TERMINATED BY '\\n'  \n",
-    "LOCATION\n",
-    "    's3://{}/{}' \n",
-    "TBLPROPERTIES (\n",
-    "    'classification'='csv', \n",
-    "    'skip.header.line.count'='1',  \n",
-    "    'serialization.null.format'='')\n",
-    "\"\"\".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') \n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "df_cursor = pd.read_csv(contributions_txt_file, sep='\\t', escapechar='\\\\', quoting=csv.QUOTE_NONE,  \n",
-    "                        error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],\n",
-    "                        chunksize=config.BUFFERSIZE)\n",
-    "chunkcount = 0\n",
-    "filename=os.path.join(\"s3://\", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')\n",
-    "for df in df_cursor: \n",
-    "    # Remove the very few records that mess up the demo \n",
-    "    # (demo purposes only! Don't do something like this in production)\n",
-    "    df = df[df['RcvDate'].str.len()>=10]\n",
-    "\n",
-    "    # set empty, non-zero, strings in date columns to null\n",
-    "    df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan\n",
-    "\n",
-    "    df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan\n",
-    "\n",
-    "    #committee ID is requred. Remove the 2 rows that don't have it.\n",
-    "    df = df[df['ID']!='']\n",
-    "\n",
-    "    # There's a record with a date stuck in the committee_id column, which causes\n",
-    "    # problems when inserting into the contributions table below. Get rid of it this \n",
-    "    # way.\n",
-    "    df = df[df['ID'].str.len() <=9]\n",
-    "\n",
-    "    # dropping the last columns\n",
-    "    df = df.drop(columns='Unnamed: 29')\n",
-    "\n",
-    "    df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')\n",
-    "    \n",
-    "    buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep=\"\\t\", escapechar='\\\\', index=None)\n",
-    "    \n",
-    "    chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)\n",
-    "    athenautils.write(body=buffer, filename=chunk_fname)\n",
-    "    chunkcount += 1    \n",
-    "    \n",
-    "print('creating donors table...')\n",
-    "q=\"\"\"\n",
-    "CREATE TABLE as_donors as\n",
-    "    with tmp as\n",
-    "      (SELECT DISTINCT \n",
-    "           NULLIF(TRIM(last_name), '') as last_name, \n",
-    "           NULLIF(TRIM(first_name), '') as first_name, \n",
-    "           NULLIF(TRIM(address_1), '') as address_1, \n",
-    "           NULLIF(TRIM(address_2), '') as address_2, \n",
-    "           NULLIF(TRIM(city), '') city, \n",
-    "           NULLIF(TRIM(state), '') as state, \n",
-    "           NULLIF(TRIM(zip), '') as zip, \n",
-    "           NULLIF(TRIM(employer), '') as employer, \n",
-    "           NULLIF(TRIM(occupation), '') as occupation\n",
-    "      FROM as_raw_table)\n",
-    "    SELECT row_number() over () as donor_id, * from tmp\"\"\"\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "q=\"\"\"\n",
-    "CREATE TABLE as_recipients as\n",
-    "    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table\n",
-    "\"\"\"\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "print('creating contributions table')\n",
-    "\n",
-    "q=\"\"\"\n",
-    "CREATE TABLE as_contributions as\n",
-    "    SELECT reciept_id as contribution_id, \n",
-    "        donors.donor_id as donor_id , \n",
-    "        committee_id as recipient_id, \n",
-    "        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, \n",
-    "        loan_amount, amount, \n",
-    "        receipt_type, vendor_last_name , \n",
-    "        vendor_first_name, vendor_address_1, vendor_address_2, \n",
-    "        vendor_city, vendor_state, vendor_zip, description, \n",
-    "        election_type, election_year, \n",
-    "        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, \n",
-    "        date_parse(report_period_end, '%m/%d/%Y') as report_period_end \n",
-    "    FROM as_raw_table JOIN as_donors donors ON \n",
-    "        coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND \n",
-    "        coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND \n",
-    "        coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND \n",
-    "        coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND \n",
-    "        coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND \n",
-    "        coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND \n",
-    "        coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND \n",
-    "        coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND \n",
-    "        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')\"\"\"\n",
-    "\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "q = \"\"\"\n",
-    "CREATE TABLE as_processed_donors AS  \n",
-    "    SELECT donor_id,  \n",
-    "     LOWER(city) AS city,  \n",
-    "     CASE WHEN (first_name IS NULL AND last_name IS NULL) \n",
-    "          THEN NULL \n",
-    "          ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) \n",
-    "     END AS name,  \n",
-    "     LOWER(zip) AS zip,  \n",
-    "     LOWER(state) AS state,  \n",
-    "     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) \n",
-    "          THEN NULL \n",
-    "          ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))\n",
-    "     END AS address,  \n",
-    "     LOWER(occupation) AS occupation, \n",
-    "     LOWER(employer) AS employer, \n",
-    "     first_name is null AS person \n",
-    " FROM as_donors\"\"\"\n",
-    "athenautils.athena_start_query(q, database=config.DATABASE)\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    "print('done')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!python ../athena_example/athena_init.py"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "conda_python3",
-   "language": "python",
-   "name": "conda_python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From e547f88b441bd4cd41471c1c5507594a3c5c2060 Mon Sep 17 00:00:00 2001
From: asajadi <asajadi@gmail.com>
Date: Sat, 8 May 2021 17:03:12 -0400
Subject: [PATCH 16/19] removing utils.py

---
 .gitignore              |   1 -
 athena_example/utils.py | 138 ----------------------------------------
 2 files changed, 139 deletions(-)
 delete mode 100644 athena_example/utils.py

diff --git a/.gitignore b/.gitignore
index a29de92b..3fb24683 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,3 @@ ENV
 distpgsql_init_db.py
 pgsql_example/pgsql_init_db.py
 .idea
-.ipynb_checkpoints*
diff --git a/athena_example/utils.py b/athena_example/utils.py
deleted file mode 100644
index 80922548..00000000
--- a/athena_example/utils.py
+++ /dev/null
@@ -1,138 +0,0 @@
-from __future__ import print_function
-import re
-import boto3
-import botocore
-import sys
-import datetime
-import os
-import time
-import pandas as pd
-from six import string_types
-import sys
-pyver = sys.version_info[0]
-
-if pyver<3:
-    from StringIO import StringIO as SomethingIO
-    from urlparse import urlparse
-else:
-    from io import BytesIO as SomethingIO
-    from urllib.parse import urlparse
-    
-sys.path.insert(0, '../athena_example/')
-import config
-
-s3 = boto3.client('s3', region_name=config.REGION, 
-                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
-  
-athena = boto3.client('athena', region_name=config.REGION, 
-                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
-
-def athena_to_panda(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, **kwargs):
-    query_execution_id = athena_start_query(query, database, output_location, region, workgroup, wait_until_finished=True)
-    df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs)
-    return df
-
-
-def athena_start_query(query, database=config.DATABASE, output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, wait_until_finished=True):
-    query_execution_id = athena.start_query_execution(
-        QueryString=query,
-        QueryExecutionContext={
-            'Database': database
-        },    
-        WorkGroup=workgroup,
-        ResultConfiguration={
-            "OutputLocation": output_location
-        }
-    )['QueryExecutionId']
-
-    seconds_to_wait = 1
-
-    if wait_until_finished:
-        while True:
-            time.sleep(seconds_to_wait)
-            seconds_to_wait += 1
-#             seconds_to_wait *= 2
-
-            execution = athena.get_query_execution(
-                QueryExecutionId=query_execution_id
-            )
-
-            if execution['QueryExecution']['Status']['State'] not in ['QUEUED', 'RUNNING']:
-                break
-
-        if execution['QueryExecution']['Status']['State'] != 'SUCCEEDED':
-            raise Exception("Athena query failed: %s" % ( execution['QueryExecution']['Status']['StateChangeReason'],), query_execution_id)
-
-    return query_execution_id
-
-# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
-# Import it instead, when it's updated.
-def is_s3_url(url):
-    """Check for an s3, s3n, or s3a url"""
-    try:
-        return urlparse(url).scheme in ["s3", "s3n", "s3a"]
-    except Exception:
-        return False
-    
-def seperate_bucket_key(url):
-    m = re.match('s3://([^/]+)/(.*)', url)
-    return m.group(1), m.group(2)
-
-def list_all(path):
-    if is_s3_url(path):
-        bucket, key = seperate_bucket_key(path)
-        objects = s3.list_objects_v2(Bucket=bucket, Prefix=key)
-        return [key['Key'] for key in objects['Contents']]
-    from os import listdir
-    from os.path import isfile, join
-    return listdir(path)
-    
-
-def pandas_read_csv(filepath_or_buffer, verbose=True, **kwargs):
-    return pd.read_csv(filepath_or_buffer, **kwargs)
-
-def reader(filename, Range='string', verbose=True):
-    '''
-        Range: look at: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
-    '''
-    log ("Reading {}".format(filename), verbose=verbose)
-    if is_s3_url(filename):
-        bucket, key = seperate_bucket_key(filename)
-        obj=s3.get_object(Bucket=bucket, Key=key, Range=Range)
-        return obj['Body']
-
-def write(body, filename):
-    bucket, key = seperate_bucket_key(filename)
-    s3.put_object(Bucket=bucket, Key=key, Body=body)
-    return
-        
-    
-def file_exists(filename):
-    bucket, key = seperate_bucket_key(filename)
-    try:
-        s3.get_object(Bucket=bucket, Key=key)
-    except botocore.exceptions.ClientError as e:
-        if e.response['Error']['Code']=='NoSuchKey':
-            return False
-        else:
-            # Something else has gone wrong.
-            raise
-    else:
-        return True
-    
-    
-def log(outstr, logfile_name=config.LOG_FILE, timestamped=True, verbose=True, quiet=False):
-    if verbose == False:
-        return
-    if timestamped:
-        outstr = "[%s]\t%s\n" % (str(datetime.datetime.now()) , outstr)
-    else:
-        outstr = "%s\n" % (outstr,)
-
-    with open(logfile_name, "a") as logfile:
-        logfile.write(outstr)
-
-    if not quiet:
-        sys.stdout.write(outstr);
-        sys.stdout.flush()
-# Print iterations progress

From 2667c9ea2c7effd8ace8053bccd4e3c1c5671e1f Mon Sep 17 00:00:00 2001
From: asajadi <asajadi@gmail.com>
Date: Sat, 8 May 2021 17:25:07 -0400
Subject: [PATCH 17/19] renaming tables

---
 athena_example/athena_example.py | 64 ++++++++++++++++----------------
 athena_example/athena_init.py    | 54 +++++++++++++--------------
 2 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py
index c1c7fb3c..04837157 100644
--- a/athena_example/athena_example.py
+++ b/athena_example/athena_example.py
@@ -113,7 +113,7 @@ def cluster_ids(clustered_dupes):
     # We did a fair amount of preprocessing of the fields in
     # `athena_init_db.py`    
     DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address
-                      from as_processed_donors"""
+                      from processed_donors"""
 
     # ## Training
 
@@ -191,13 +191,13 @@ def cluster_ids(clustered_dupes):
 
     # To run blocking on such a large set of data, we create a separate table
     # that contains blocking keys and record ids
-    print('creating as_blocking_map database')
-    athenautils.drop_external_table("as_blocking_map", 
-                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map'),
+    print('creating blocking_map database')
+    athenautils.drop_external_table("blocking_map", 
+                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map'),
                                     database=config.DATABASE)
 
     q="""
-    CREATE EXTERNAL TABLE as_blocking_map     
+    CREATE EXTERNAL TABLE blocking_map     
         (block_key VARCHAR(200), donor_id INTEGER)
     ROW FORMAT DELIMITED
       FIELDS TERMINATED BY '\t'
@@ -208,7 +208,7 @@ def cluster_ids(clustered_dupes):
         'classification'='csv', 
         --'skip.header.line.count'='1',  
         'serialization.null.format'='')
-    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map') 
+    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') 
     athenautils.athena_start_query(q, database=config.DATABASE)
 
     # If dedupe learned a Index Predicate, we have to take a pass
@@ -219,7 +219,7 @@ def cluster_ids(clustered_dupes):
     # This never runs, index_fields is empty, possible bug?
     for field in deduper.fingerprinter.index_fields:
         q = """
-        SELECT DISTINCT {field} FROM as_processed_donors
+        SELECT DISTINCT {field} FROM processed_donors
         WHERE {field} IS NOT NULL
         """.format(field=field)
         cur = dict_cursor_execute(q, databse=config.DATABASE)
@@ -236,7 +236,7 @@ def cluster_ids(clustered_dupes):
 
     b_data = deduper.fingerprinter(full_data)
     athenautils.write_many(b_data, 
-                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_blocking_map/blocking.csv'))
+                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv'))
 
 
     # select unique pairs to compare
@@ -250,12 +250,12 @@ def cluster_ids(clustered_dupes):
                       ARRAY[ b.city, b.name, b.zip, b.state, b.address])
                   AS JSON))
         FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west
-             from as_blocking_map as l
-             INNER JOIN as_blocking_map as r
+             from blocking_map as l
+             INNER JOIN blocking_map as r
              using (block_key)
              where l.donor_id < r.donor_id) ids
-        INNER JOIN as_processed_donors a on ids.east=a.donor_id
-        INNER JOIN as_processed_donors b on ids.west=b.donor_id
+        INNER JOIN processed_donors a on ids.east=a.donor_id
+        INNER JOIN processed_donors b on ids.west=b.donor_id
        """
     read_cur = cursor_execute(q, database=config.DATABASE)
 
@@ -266,14 +266,14 @@ def cluster_ids(clustered_dupes):
     clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
                                       threshold=0.5)
 
-#     athenautils.athena_start_query("DROP TABLE IF EXISTS as_entity_map", database=config.DATABASE)
-    athenautils.drop_external_table("as_entity_map", 
-                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/'), 
+#     athenautils.athena_start_query("DROP TABLE IF EXISTS entity_map", database=config.DATABASE)
+    athenautils.drop_external_table("entity_map", 
+                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/'), 
                                     database=config.DATABASE)
     
-    print('creating as_entity_map database')
+    print('creating entity_map database')
     q="""
-    CREATE EXTERNAL TABLE as_entity_map     
+    CREATE EXTERNAL TABLE entity_map     
         (donor_id INTEGER, canon_id INTEGER, 
          cluster_score FLOAT)
     ROW FORMAT DELIMITED
@@ -285,11 +285,11 @@ def cluster_ids(clustered_dupes):
         'classification'='csv', 
         --'skip.header.line.count'='1',  
         'serialization.null.format'='')
-    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map') 
+    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') 
     athenautils.athena_start_query(q, database=config.DATABASE) 
 
     athenautils.write_many(cluster_ids(clustered_dupes),
-                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_entity_map/entity_map.csv'))
+                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv'))
 
     # Print out the number of duplicates found
     print('# duplicate sets')
@@ -303,28 +303,28 @@ def cluster_ids(clustered_dupes):
 
     locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers
     
-    athenautils.athena_start_query("DROP TABLE IF EXISTS as_e_map", database=config.DATABASE)
+    athenautils.athena_start_query("DROP TABLE IF EXISTS e_map", database=config.DATABASE)
     
     q = """
-        CREATE TABLE as_e_map as 
-        SELECT COALESCE(canon_id, as_entity_map.donor_id) AS canon_id, as_entity_map.donor_id 
-        FROM as_entity_map 
-        RIGHT JOIN as_donors USING(donor_id)        
+        CREATE TABLE e_map as 
+        SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id 
+        FROM entity_map 
+        RIGHT JOIN donors USING(donor_id)        
         """    
     athenautils.athena_start_query(q, database=config.DATABASE)
     
     q = """
-        SELECT array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
+        SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
             donation_totals.totals AS totals 
-        FROM as_donors INNER JOIN 
+        FROM donors INNER JOIN 
             (SELECT canon_id, SUM(cast (amount as double)) AS totals 
-            FROM as_contributions INNER JOIN as_e_map 
+            FROM contributions INNER JOIN e_map 
             USING (donor_id) 
             GROUP BY (canon_id) 
             ORDER BY totals 
             DESC LIMIT 10) 
             AS donation_totals 
-        ON as_donors.donor_id = donation_totals.canon_id
+        ON donors.donor_id = donation_totals.canon_id
         ORDER BY totals DESC
     """
     cur = dict_cursor_execute(q, database=config.DATABASE)
@@ -339,10 +339,10 @@ def cluster_ids(clustered_dupes):
     q = """
         with donorscontributions as(
 
-            SELECT as_donors.donor_id, 
-                array_join(filter(array[as_donors.first_name, as_donors.last_name], x-> x IS NOT NULL), ' ') AS name,
-                cast(as_contributions.amount as double) as amount
-            FROM as_donors INNER JOIN as_contributions 
+            SELECT donors.donor_id, 
+                array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,
+                cast(contributions.amount as double) as amount
+            FROM donors INNER JOIN contributions 
                 USING (donor_id) 
             )
         SELECT name, sum(amount) AS totals  
diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py
index 45a5e254..099c5489 100644
--- a/athena_example/athena_init.py
+++ b/athena_example/athena_init.py
@@ -8,7 +8,7 @@
 [athena_example.py](athena_example.py).
  
 Tables created:
-* as_raw_table - raw import of entire CSV file
+* raw_table - raw import of entire CSV file
 * donors - all distinct donors based on name and address
 * recipients - all distinct campaign contribution recipients
 * contributions - contribution amounts tied to donor and recipients tables
@@ -51,17 +51,17 @@
 
 
 print('importing raw data from csv...')
-athenautils.drop_external_table("as_raw_table", 
-                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table'),
+athenautils.drop_external_table("raw_table", 
+                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table'),
                                 database=config.DATABASE)    
-athenautils.athena_start_query("DROP TABLE IF EXISTS as_donors", database=config.DATABASE)
-athenautils.athena_start_query("DROP TABLE IF EXISTS as_recipients", database=config.DATABASE)
-athenautils.athena_start_query("DROP TABLE IF EXISTS as_contributions", database=config.DATABASE)
-athenautils.athena_start_query("DROP TABLE IF EXISTS as_processed_donors", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS donors", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS recipients", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS contributions", database=config.DATABASE)
+athenautils.athena_start_query("DROP TABLE IF EXISTS processed_donors", database=config.DATABASE)
 
 
 q=r"""
-CREATE EXTERNAL TABLE as_raw_table 
+CREATE EXTERNAL TABLE raw_table 
     (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), 
     address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), 
     state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), 
@@ -85,7 +85,7 @@
     'classification'='csv', 
     'skip.header.line.count'='1',  
     'serialization.null.format'='')
-""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'as_raw_table') 
+""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') 
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 
@@ -93,7 +93,7 @@
                         error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],
                         chunksize=config.BUFFERSIZE)
 chunkcount = 0
-filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'as_raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')
+filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')
 for df in df_cursor: 
     # Remove the very few records that mess up the demo 
     # (demo purposes only! Don't do something like this in production)
@@ -125,7 +125,7 @@
     
 print('creating donors table...')
 q="""
-CREATE TABLE as_donors as
+CREATE TABLE donors as
     with tmp as
       (SELECT DISTINCT 
            NULLIF(TRIM(last_name), '') as last_name, 
@@ -137,21 +137,21 @@
            NULLIF(TRIM(zip), '') as zip, 
            NULLIF(TRIM(employer), '') as employer, 
            NULLIF(TRIM(occupation), '') as occupation
-      FROM as_raw_table)
+      FROM raw_table)
     SELECT row_number() over () as donor_id, * from tmp"""
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 
 q="""
-CREATE TABLE as_recipients as
-    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM as_raw_table
+CREATE TABLE recipients as
+    SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table
 """
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 print('creating contributions table')
 
 q="""
-CREATE TABLE as_contributions as
+CREATE TABLE contributions as
     SELECT reciept_id as contribution_id, 
         donors.donor_id as donor_id , 
         committee_id as recipient_id, 
@@ -163,21 +163,21 @@
         election_type, election_year, 
         date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, 
         date_parse(report_period_end, '%m/%d/%Y') as report_period_end 
-    FROM as_raw_table JOIN as_donors donors ON 
-        coalesce(donors.first_name, '') = coalesce(TRIM(as_raw_table.first_name), '') AND 
-        coalesce(donors.last_name, '') = coalesce(TRIM(as_raw_table.last_name), '') AND 
-        coalesce(donors.address_1, '') = coalesce(TRIM(as_raw_table.address_1), '') AND 
-        coalesce(donors.address_2, '') = coalesce(TRIM(as_raw_table.address_2), '') AND 
-        coalesce(donors.city, '') = coalesce(TRIM(as_raw_table.city), '') AND 
-        coalesce(donors.state, '') = coalesce(TRIM(as_raw_table.state), '') AND 
-        coalesce(donors.employer, '') = coalesce(TRIM(as_raw_table.employer), '') AND 
-        coalesce(donors.occupation , '')= coalesce(TRIM(as_raw_table.occupation), '') AND 
-        coalesce(donors.zip, '') = coalesce(TRIM(as_raw_table.zip), '')"""
+    FROM raw_table JOIN donors donors ON 
+        coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND 
+        coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND 
+        coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND 
+        coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND 
+        coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND 
+        coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND 
+        coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND 
+        coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND 
+        coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')"""
 
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 q = """
-CREATE TABLE as_processed_donors AS  
+CREATE TABLE processed_donors AS  
     SELECT donor_id,  
      LOWER(city) AS city,  
      CASE WHEN (first_name IS NULL AND last_name IS NULL) 
@@ -193,7 +193,7 @@
      LOWER(occupation) AS occupation, 
      LOWER(employer) AS employer, 
      first_name is null AS person 
- FROM as_donors"""
+ FROM donors"""
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 

From 8a28655e4f1b863e691cd583d87f0969d5184f9f Mon Sep 17 00:00:00 2001
From: asajadi <asajadi@gmail.com>
Date: Sat, 8 May 2021 17:33:42 -0400
Subject: [PATCH 18/19] modifying requirements.txt

---
 athena_example/requirements.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/athena_example/requirements.txt b/athena_example/requirements.txt
index 18c098ae..5dcfecc9 100644
--- a/athena_example/requirements.txt
+++ b/athena_example/requirements.txt
@@ -1 +1,3 @@
-mysqlclient
+pandas
+boto3
+dedupe

From 736ff0b8520ebe117990bdaa23fbbd7916e28c13 Mon Sep 17 00:00:00 2001
From: asajadi <asajadi@gmail.com>
Date: Sat, 8 May 2021 22:57:02 -0400
Subject: [PATCH 19/19] linting

---
 athena_example/athena_example.py | 290 +++++++++++++++++--------------
 athena_example/athena_init.py    | 283 +++++++++++++++++-------------
 athena_example/athenautils.py    | 229 +++++++++++++++---------
 3 files changed, 468 insertions(+), 334 deletions(-)

diff --git a/athena_example/athena_example.py b/athena_example/athena_example.py
index 04837157..d275af51 100644
--- a/athena_example/athena_example.py
+++ b/athena_example/athena_example.py
@@ -1,4 +1,3 @@
-
 """
 This is an example of working with very large data. There are about
 700,000 unduplicated donors in this database of Illinois political
@@ -16,46 +15,59 @@
 [csv_example](csv_example.html)
 """
 
-# There is a little bit difference between the result 
+# There is a slight difference between the result
 # of this module and the athena one. The reason is due to
-# Some special (and mostly erroneous) characters, such as \a .. 
+# Some special (and mostly erroneous) characters, such as \a ..
 # Which are dealt with differently by athena and athena/panda
 
+import athenautils
+import config
 import sys
 import os
-import itertools
 import time
 import logging
 import optparse
 import locale
 import json
-from io import StringIO
-import csv
-import pandas as pd
 
-import boto3
 import dedupe
 import dedupe.backport
-sys.path.insert(0, '../athena_example/')
-import config
-sys.path.insert(0, '../athena_example/')
-import athenautils
+
+sys.path.insert(0, "../athena_example/")
+
+sys.path.insert(0, "../athena_example/")
+
 
 def cursor_execute(query, database):
-    '''
+    """
     The MySQL compatible Cursor
-    '''
-    return athenautils.cursor_execute(query, database=database, 
-                                      cursortype='tuple', buffersize=config.BUFFERSIZE,
-                                      escapechar=None, keep_default_na=False, na_values=[''])
+    """
+    return athenautils.cursor_execute(
+        query,
+        database=database,
+        cursortype="tuple",
+        buffersize=config.BUFFERSIZE,
+        escapechar=None,
+        keep_default_na=False,
+        na_values=[""],
+    )
+
 
 def dict_cursor_execute(query, database):
-    '''
+    """
     The MySQL compatible DicCursor
-    '''
-    return athenautils.cursor_execute(query, database=database, 
-                                      cursortype='dict', buffersize=config.BUFFERSIZE,
-                                      escapechar=None, keep_default_na=False, na_values=[''])
+    """
+    return athenautils.cursor_execute(
+        query,
+        database=database,
+        cursortype="dict",
+        buffersize=config.BUFFERSIZE,
+        escapechar=None,
+        keep_default_na=False,
+        na_values=[""],
+    )
+
+
 def record_pairs(result_set):
     for i, row in enumerate(result_set):
         a_record_id, a_record, b_record_id, b_record = row
@@ -76,18 +88,22 @@ def cluster_ids(clustered_dupes):
             yield donor_id, cluster_id, score
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
-    ## Logging
+    # Logging
 
     # Dedupe uses Python logging to show or suppress verbose output. Added
     # for convenience.  To enable verbose output, run `python
     # examples/athena_example/athena_example.py -v`
-    
+
     optp = optparse.OptionParser()
-    optp.add_option('-v', '--verbose', dest='verbose', action='count',
-                    help='Increase verbosity (specify multiple times for more)'
-                    )
+    optp.add_option(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        action="count",
+        help="Increase verbosity (specify multiple times for more)",
+    )
     (opts, args) = optp.parse_args()
     log_level = logging.WARNING
     if opts.verbose:
@@ -96,14 +112,10 @@ def cluster_ids(clustered_dupes):
         elif opts.verbose >= 2:
             log_level = logging.DEBUG
 
-
     logging.getLogger().setLevel(log_level)
 
-    
-
-
-    settings_file = 'athena_example_settings'
-    training_file = 'athena_example_training.json'
+    settings_file = "athena_example_settings"
+    training_file = "athena_example_training.json"
 
     start_time = time.time()
 
@@ -111,15 +123,15 @@ def cluster_ids(clustered_dupes):
     # in campaign donor info.
     #
     # We did a fair amount of preprocessing of the fields in
-    # `athena_init_db.py`    
+    # `athena_init.py`
     DONOR_SELECT = """SELECT donor_id, city, name, zip, state, address
                       from processed_donors"""
 
     # ## Training
 
     if os.path.exists(settings_file):
-        print('reading from ', settings_file)
-        with open(settings_file, 'rb') as sf:
+        print("reading from ", settings_file)
+        with open(settings_file, "rb") as sf:
             deduper = dedupe.StaticDedupe(sf, num_cores=4)
     else:
         # Define the fields dedupe will pay attention to
@@ -127,13 +139,13 @@ def cluster_ids(clustered_dupes):
         # The address, city, and zip fields are often missing, so we'll
         # tell dedupe that, and we'll learn a model that take that into
         # account
-        fields = [{'field': 'name', 'type': 'String'},
-                  {'field': 'address', 'type': 'String',
-                   'has missing': True},
-                  {'field': 'city', 'type': 'ShortString', 'has missing': True},
-                  {'field': 'state', 'type': 'ShortString', 'has missing': True},
-                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},
-                  ]
+        fields = [
+            {"field": "name", "type": "String"},
+            {"field": "address", "type": "String", "has missing": True},
+            {"field": "city", "type": "ShortString", "has missing": True},
+            {"field": "state", "type": "ShortString", "has missing": True},
+            {"field": "zip", "type": "ShortString", "has missing": True},
+        ]
 
         # Create a new deduper object and pass our data model to it.
         deduper = dedupe.Dedupe(fields, num_cores=4)
@@ -141,7 +153,6 @@ def cluster_ids(clustered_dupes):
         # We will sample pairs from the entire donor table for training
         cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)
         temp_d = {i: row for i, row in enumerate(cur)}
-            
 
         # If we have training data saved from a previous run of dedupe,
         # look for it an load it in.
@@ -149,7 +160,7 @@ def cluster_ids(clustered_dupes):
         # __Note:__ if you want to train from
         # scratch, delete the training_file
         if os.path.exists(training_file):
-            print('reading labeled examples from ', training_file)
+            print("reading labeled examples from ", training_file)
             with open(training_file) as tf:
                 deduper.prepare_training(temp_d, training_file=tf)
         else:
@@ -159,7 +170,7 @@ def cluster_ids(clustered_dupes):
 
         # ## Active learning
 
-        print('starting active labeling...')
+        print("starting active labeling...")
         # Starts the training loop. Dedupe will find the next pair of records
         # it is least certain about and ask you to label them as duplicates
         # or not.
@@ -168,7 +179,7 @@ def cluster_ids(clustered_dupes):
         # press 'f' when you are finished
         dedupe.convenience.console_label(deduper)
         # When finished, save our labeled, training pairs to disk
-        with open(training_file, 'w') as tf:
+        with open(training_file, "w") as tf:
             deduper.write_training(tf)
 
         # Notice our the argument here
@@ -178,7 +189,7 @@ def cluster_ids(clustered_dupes):
         # too many blocks and too many comparisons.
         deduper.train(recall=0.90)
 
-        with open(settings_file, 'wb') as sf:
+        with open(settings_file, "wb") as sf:
             deduper.write_settings(sf)
 
         # We can now remove some of the memory hobbing objects we used
@@ -187,66 +198,77 @@ def cluster_ids(clustered_dupes):
 
     # ## Blocking
 
-    print('blocking...')
+    print("blocking...")
 
     # To run blocking on such a large set of data, we create a separate table
     # that contains blocking keys and record ids
-    print('creating blocking_map database')
-    athenautils.drop_external_table("blocking_map", 
-                                    location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map'),
-                                    database=config.DATABASE)
+    print("creating blocking_map database")
+    athenautils.drop_external_table(
+        "blocking_map",
+        location="s3://{}/{}".format(
+            config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "blocking_map"
+        ),
+        database=config.DATABASE,
+    )
 
-    q="""
-    CREATE EXTERNAL TABLE blocking_map     
+    q = """
+    CREATE EXTERNAL TABLE blocking_map
         (block_key VARCHAR(200), donor_id INTEGER)
     ROW FORMAT DELIMITED
       FIELDS TERMINATED BY '\t'
-      LINES TERMINATED BY '\n'  
+      LINES TERMINATED BY '\n'
     LOCATION
-        's3://{}/{}' 
+        's3://{}/{}'
     TBLPROPERTIES (
-        'classification'='csv', 
-        --'skip.header.line.count'='1',  
+        'classification'='csv',
+        --'skip.header.line.count'='1',
         'serialization.null.format'='')
-    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map') 
+    """.format(
+        config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "blocking_map"
+    )
     athenautils.athena_start_query(q, database=config.DATABASE)
 
     # If dedupe learned a Index Predicate, we have to take a pass
     # through the data and create indices.
-    print('creating inverted index')
+    print("creating inverted index")
 
-    # Armin: 
+    # Armin:
     # This never runs, index_fields is empty, possible bug?
     for field in deduper.fingerprinter.index_fields:
         q = """
         SELECT DISTINCT {field} FROM processed_donors
         WHERE {field} IS NOT NULL
-        """.format(field=field)
+        """.format(
+            field=field
+        )
         cur = dict_cursor_execute(q, databse=config.DATABASE)
         field_data = (row[field] for row in cur)
         deduper.fingerprinter.index(field_data, field)
-     
 
     # Now we are ready to write our blocking map table by creating a
     # generator that yields unique `(block_key, donor_id)` tuples.
-    print('writing blocking map')
-    
-    read_cur  = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)
-    full_data = ((row['donor_id'], row) for row in read_cur)
+    print("writing blocking map")
 
-    b_data = deduper.fingerprinter(full_data)
-    athenautils.write_many(b_data, 
-                           filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'blocking_map/blocking.csv'))
+    read_cur = dict_cursor_execute(DONOR_SELECT, database=config.DATABASE)
+    full_data = ((row["donor_id"], row) for row in read_cur)
 
+    b_data = deduper.fingerprinter(full_data)
+    athenautils.write_many(
+        b_data,
+        filename="s3://{}/{}".format(
+            config.DATABASE_BUCKET,
+            config.DATABASE_ROOT_KEY + "blocking_map/blocking.csv",
+        ),
+    )
 
     # select unique pairs to compare
-    q="""
+    q = """
         SELECT a.donor_id,
             json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],
                                   ARRAY[ a.city, a.name, a.zip, a.state, a.address])
                         AS JSON)),
             b.donor_id,
-            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'], 
+            json_format(CAST (MAP(ARRAY['city', 'name', 'zip', 'state', 'address'],
                       ARRAY[ b.city, b.name, b.zip, b.state, b.address])
                   AS JSON))
         FROM (SELECT DISTINCT l.donor_id as east, r.donor_id as west
@@ -259,40 +281,51 @@ def cluster_ids(clustered_dupes):
        """
     read_cur = cursor_execute(q, database=config.DATABASE)
 
-
     # ## Clustering
 
-    print('clustering...')
-    clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
-                                      threshold=0.5)
-
-#     athenautils.athena_start_query("DROP TABLE IF EXISTS entity_map", database=config.DATABASE)
-    athenautils.drop_external_table("entity_map", 
-                                    location='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/'), 
-                                    database=config.DATABASE)
-    
-    print('creating entity_map database')
-    q="""
-    CREATE EXTERNAL TABLE entity_map     
-        (donor_id INTEGER, canon_id INTEGER, 
+    print("clustering...")
+    clustered_dupes = deduper.cluster(
+        deduper.score(record_pairs(read_cur)), threshold=0.5
+    )
+
+    #     athenautils.athena_start_query("DROP TABLE IF EXISTS entity_map", database=config.DATABASE)
+    athenautils.drop_external_table(
+        "entity_map",
+        location="s3://{}/{}".format(
+            config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "entity_map/"
+        ),
+        database=config.DATABASE,
+    )
+
+    print("creating entity_map database")
+    q = """
+    CREATE EXTERNAL TABLE entity_map
+        (donor_id INTEGER, canon_id INTEGER,
          cluster_score FLOAT)
     ROW FORMAT DELIMITED
       FIELDS TERMINATED BY '\t'
-      LINES TERMINATED BY '\n'  
+      LINES TERMINATED BY '\n'
     LOCATION
-        's3://{}/{}' 
+        's3://{}/{}'
     TBLPROPERTIES (
-        'classification'='csv', 
-        --'skip.header.line.count'='1',  
+        'classification'='csv',
+        --'skip.header.line.count'='1',
         'serialization.null.format'='')
-    """.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map') 
-    athenautils.athena_start_query(q, database=config.DATABASE) 
+    """.format(
+        config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "entity_map"
+    )
+    athenautils.athena_start_query(q, database=config.DATABASE)
 
-    athenautils.write_many(cluster_ids(clustered_dupes),
-                          filename='s3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'entity_map/entity_map.csv'))
+    athenautils.write_many(
+        cluster_ids(clustered_dupes),
+        filename="s3://{}/{}".format(
+            config.DATABASE_BUCKET,
+            config.DATABASE_ROOT_KEY + "entity_map/entity_map.csv",
+        ),
+    )
 
     # Print out the number of duplicates found
-    print('# duplicate sets')
+    print("# duplicate sets")
 
     # ## Payoff
 
@@ -301,29 +334,32 @@ def cluster_ids(clustered_dupes):
     #
     # For example, let's see who the top 10 donors are.
 
-    locale.setlocale(locale.LC_ALL, 'en_CA.UTF-8')  # for pretty printing numbers
-    
-    athenautils.athena_start_query("DROP TABLE IF EXISTS e_map", database=config.DATABASE)
-    
+    # for pretty printing numbers
+    locale.setlocale(locale.LC_ALL, "en_CA.UTF-8")
+
+    athenautils.athena_start_query(
+        "DROP TABLE IF EXISTS e_map", database=config.DATABASE
+    )
+
     q = """
-        CREATE TABLE e_map as 
-        SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id 
-        FROM entity_map 
-        RIGHT JOIN donors USING(donor_id)        
-        """    
+        CREATE TABLE e_map as
+        SELECT COALESCE(canon_id, entity_map.donor_id) AS canon_id, entity_map.donor_id
+        FROM entity_map
+        RIGHT JOIN donors USING(donor_id)
+        """
     athenautils.athena_start_query(q, database=config.DATABASE)
-    
+
     q = """
-        SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,   
-            donation_totals.totals AS totals 
-        FROM donors INNER JOIN 
-            (SELECT canon_id, SUM(cast (amount as double)) AS totals 
-            FROM contributions INNER JOIN e_map 
-            USING (donor_id) 
-            GROUP BY (canon_id) 
-            ORDER BY totals 
-            DESC LIMIT 10) 
-            AS donation_totals 
+        SELECT array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,
+            donation_totals.totals AS totals
+        FROM donors INNER JOIN
+            (SELECT canon_id, SUM(cast (amount as double)) AS totals
+            FROM contributions INNER JOIN e_map
+            USING (donor_id)
+            GROUP BY (canon_id)
+            ORDER BY totals
+            DESC LIMIT 10)
+            AS donation_totals
         ON donors.donor_id = donation_totals.canon_id
         ORDER BY totals DESC
     """
@@ -331,31 +367,31 @@ def cluster_ids(clustered_dupes):
 
     print("Top Donors (deduped)")
     for row in cur:
-        row['totals'] = locale.currency(row['totals'], grouping=True)
-        print('%(totals)20s: %(name)s' % row)
+        row["totals"] = locale.currency(row["totals"], grouping=True)
+        print("%(totals)20s: %(name)s" % row)
 
     # Compare this to what we would have gotten if we hadn't done any
     # deduplication
     q = """
         with donorscontributions as(
 
-            SELECT donors.donor_id, 
+            SELECT donors.donor_id,
                 array_join(filter(array[donors.first_name, donors.last_name], x-> x IS NOT NULL), ' ') AS name,
                 cast(contributions.amount as double) as amount
-            FROM donors INNER JOIN contributions 
-                USING (donor_id) 
+            FROM donors INNER JOIN contributions
+                USING (donor_id)
             )
-        SELECT name, sum(amount) AS totals  
+        SELECT name, sum(amount) AS totals
         FROM donorscontributions
         GROUP BY donor_id, name
-        ORDER BY totals DESC 
+        ORDER BY totals DESC
         LIMIT 10
     """
     cur = dict_cursor_execute(q, database=config.DATABASE)
 
     print("Top Donors (raw)")
     for row in cur:
-        row['totals'] = locale.currency(row['totals'], grouping=True)
-        print('%(totals)20s: %(name)s' % row)
+        row["totals"] = locale.currency(row["totals"], grouping=True)
+        print("%(totals)20s: %(name)s" % row)
 
-    print('ran in', time.time() - start_time, 'seconds')
+    print("ran in", time.time() - start_time, "seconds")
diff --git a/athena_example/athena_init.py b/athena_example/athena_init.py
index 099c5489..42108846 100644
--- a/athena_example/athena_init.py
+++ b/athena_example/athena_init.py
@@ -3,10 +3,10 @@
 This is a setup script for athena_example.  It downloads a zip file of
 Illinois campaign contributions and loads them into a Athena database
 named 'contributions'.
- 
+
 __Note:__ You will need to run this script first before execuing
 [athena_example.py](athena_example.py).
- 
+
 Tables created:
 * raw_table - raw import of entire CSV file
 * donors - all distinct donors based on name and address
@@ -14,189 +14,226 @@
 * contributions - contribution amounts tied to donor and recipients tables
 """
 
+import athenautils
 import os
 import zipfile
-import warnings
 import pandas as pd
 import numpy as np
 from urllib.request import urlopen
-import boto3
 import config
 import csv
 import sys
-sys.path.insert(0, '../athena_example/')
-import athenautils
 
+sys.path.insert(0, "../athena_example/")
 
-contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'
-contributions_txt_file = 'Illinois-campaign-contributions.txt'
 
-if not os.path.exists(contributions_zip_file) :
-    print('downloading', contributions_zip_file, '(~60mb) ...')
-    u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')
-    localFile = open(contributions_zip_file, 'wb')
+contributions_zip_file = "Illinois-campaign-contributions.txt.zip"
+contributions_txt_file = "Illinois-campaign-contributions.txt"
+
+if not os.path.exists(contributions_zip_file):
+    print("downloading", contributions_zip_file, "(~60mb) ...")
+    u = urlopen(
+        "https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip"
+    )
+    localFile = open(contributions_zip_file, "wb")
     localFile.write(u.read())
     localFile.close()
 
-if not os.path.exists(contributions_txt_file) :
-    zip_file = zipfile.ZipFile(contributions_zip_file, 'r')
-    print('extracting %s' % contributions_zip_file)
+if not os.path.exists(contributions_txt_file):
+    zip_file = zipfile.ZipFile(contributions_zip_file, "r")
+    print("extracting %s" % contributions_zip_file)
     zip_file_contents = zip_file.namelist()
     for f in zip_file_contents:
-        if ('.txt' in f):
+        if ".txt" in f:
             zip_file.extract(f)
     zip_file.close()
 
 
-
-
-print('importing raw data from csv...')
-athenautils.drop_external_table("raw_table", 
-                                location = 's3://{}/{}'.format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table'),
-                                database=config.DATABASE)    
+print("importing raw data from csv...")
+athenautils.drop_external_table(
+    "raw_table",
+    location="s3://{}/{}".format(
+        config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "raw_table"
+    ),
+    database=config.DATABASE,
+)
 athenautils.athena_start_query("DROP TABLE IF EXISTS donors", database=config.DATABASE)
-athenautils.athena_start_query("DROP TABLE IF EXISTS recipients", database=config.DATABASE)
-athenautils.athena_start_query("DROP TABLE IF EXISTS contributions", database=config.DATABASE)
-athenautils.athena_start_query("DROP TABLE IF EXISTS processed_donors", database=config.DATABASE)
-
-
-q=r"""
-CREATE EXTERNAL TABLE raw_table 
-    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), 
-    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), 
-    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), 
-    date_recieved VARCHAR(10), loan_amount VARCHAR(12), 
-    amount VARCHAR(23), receipt_type VARCHAR(23), 
-    employer VARCHAR(70), occupation VARCHAR(40), 
-    vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), 
-    vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), 
-    vendor_city VARCHAR(20), vendor_state VARCHAR(10), 
-    vendor_zip VARCHAR(10), description VARCHAR(90), 
-    election_type VARCHAR(10), election_year VARCHAR(10), 
-    report_period_begin VARCHAR(10), report_period_end VARCHAR(33), 
-    committee_name VARCHAR(70), committee_id VARCHAR(37)) 
+athenautils.athena_start_query(
+    "DROP TABLE IF EXISTS recipients", database=config.DATABASE
+)
+athenautils.athena_start_query(
+    "DROP TABLE IF EXISTS contributions", database=config.DATABASE
+)
+athenautils.athena_start_query(
+    "DROP TABLE IF EXISTS processed_donors", database=config.DATABASE
+)
+
+
+q = r"""
+CREATE EXTERNAL TABLE raw_table
+    (reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35),
+    address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20),
+    state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24),
+    date_recieved VARCHAR(10), loan_amount VARCHAR(12),
+    amount VARCHAR(23), receipt_type VARCHAR(23),
+    employer VARCHAR(70), occupation VARCHAR(40),
+    vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20),
+    vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31),
+    vendor_city VARCHAR(20), vendor_state VARCHAR(10),
+    vendor_zip VARCHAR(10), description VARCHAR(90),
+    election_type VARCHAR(10), election_year VARCHAR(10),
+    report_period_begin VARCHAR(10), report_period_end VARCHAR(33),
+    committee_name VARCHAR(70), committee_id VARCHAR(37))
 ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '\t'
   ESCAPED BY '\\'
-  LINES TERMINATED BY '\n'  
+  LINES TERMINATED BY '\n'
 LOCATION
-    's3://{}/{}' 
+    's3://{}/{}'
 TBLPROPERTIES (
-    'classification'='csv', 
-    'skip.header.line.count'='1',  
+    'classification'='csv',
+    'skip.header.line.count'='1',
     'serialization.null.format'='')
-""".format(config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY+'raw_table') 
+""".format(
+    config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY + "raw_table"
+)
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 
-df_cursor = pd.read_csv(contributions_txt_file, sep='\t', escapechar='\\', quoting=csv.QUOTE_NONE,  
-                        error_bad_lines=False, warn_bad_lines=True, dtype=str, keep_default_na=False, na_values=[''],
-                        chunksize=config.BUFFERSIZE)
+df_cursor = pd.read_csv(
+    contributions_txt_file,
+    sep="\t",
+    escapechar="\\",
+    quoting=csv.QUOTE_NONE,
+    error_bad_lines=False,
+    warn_bad_lines=True,
+    dtype=str,
+    keep_default_na=False,
+    na_values=[""],
+    chunksize=config.BUFFERSIZE,
+)
 chunkcount = 0
-filename=os.path.join("s3://", config.DATABASE_BUCKET, config.DATABASE_ROOT_KEY,'raw_table', os.path.splitext(contributions_txt_file)[0]+'.csv')
-for df in df_cursor: 
-    # Remove the very few records that mess up the demo 
+filename = os.path.join(
+    "s3://",
+    config.DATABASE_BUCKET,
+    config.DATABASE_ROOT_KEY,
+    "raw_table",
+    os.path.splitext(contributions_txt_file)[0] + ".csv",
+)
+for df in df_cursor:
+    # Remove the very few records that mess up the demo
     # (demo purposes only! Don't do something like this in production)
-    df = df[df['RcvDate'].str.len()>=10]
+    df = df[df["RcvDate"].str.len() >= 10]
 
     # set empty, non-zero, strings in date columns to null
-    df.loc[df['RptPdBegDate'].str.len()<10,'RptPdBegDate'] = np.nan
+    df.loc[df["RptPdBegDate"].str.len() < 10, "RptPdBegDate"] = np.nan
 
-    df.loc[df['RptPdEndDate'].str.len()<10,'RptPdEndDate'] = np.nan
+    df.loc[df["RptPdEndDate"].str.len() < 10, "RptPdEndDate"] = np.nan
 
-    #committee ID is requred. Remove the 2 rows that don't have it.
-    df = df[df['ID']!='']
+    # committee ID is required. Remove the 2 rows that don't have it.
+    df = df[df["ID"] != ""]
 
-    # There's a record with a date stuck in the committee_id column, which causes
-    # problems when inserting into the contributions table below. Get rid of it this 
-    # way.
-    df = df[df['ID'].str.len() <=9]
+    # There's a record with a date stuck in the committee_id column,
+    # which causes problems when inserting into the contributions table below.
+    # Get rid of it this way.
 
-    # dropping the last columns
-    df = df.drop(columns='Unnamed: 29')
+    df = df[df["ID"].str.len() <= 9]
 
-    df_lower=df.apply(lambda x: x.str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') if x.dtype=='object' else x, result_type='expand')
-    
-    buffer = df_lower.to_csv(quoting=csv.QUOTE_NONE, sep="\t", escapechar='\\', index=None)
-    
-    chunk_fname = athenautils.file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)
+    # dropping the last column
+    df = df.drop(columns="Unnamed: 29")
+
+    df_lower = df.apply(
+        lambda x: x.str.lower()
+        .str.normalize("NFKD")
+        .str.encode("ascii", errors="ignore")
+        .str.decode("utf-8")
+        if x.dtype == "object"
+        else x,
+        result_type="expand",
+    )
+
+    buffer = df_lower.to_csv(
+        quoting=csv.QUOTE_NONE, sep="\t", escapechar="\\", index=None
+    )
+
+    chunk_fname = athenautils.file_name_append(
+        filename, "_{}".format(chunkcount), ommitext=False
+    )
     athenautils.write(body=buffer, filename=chunk_fname)
-    chunkcount += 1    
-    
-print('creating donors table...')
-q="""
+    chunkcount += 1
+
+print("creating donors table...")
+q = """
 CREATE TABLE donors as
     with tmp as
-      (SELECT DISTINCT 
-           NULLIF(TRIM(last_name), '') as last_name, 
-           NULLIF(TRIM(first_name), '') as first_name, 
-           NULLIF(TRIM(address_1), '') as address_1, 
-           NULLIF(TRIM(address_2), '') as address_2, 
-           NULLIF(TRIM(city), '') city, 
-           NULLIF(TRIM(state), '') as state, 
-           NULLIF(TRIM(zip), '') as zip, 
-           NULLIF(TRIM(employer), '') as employer, 
+      (SELECT DISTINCT
+           NULLIF(TRIM(last_name), '') as last_name,
+           NULLIF(TRIM(first_name), '') as first_name,
+           NULLIF(TRIM(address_1), '') as address_1,
+           NULLIF(TRIM(address_2), '') as address_2,
+           NULLIF(TRIM(city), '') city,
+           NULLIF(TRIM(state), '') as state,
+           NULLIF(TRIM(zip), '') as zip,
+           NULLIF(TRIM(employer), '') as employer,
            NULLIF(TRIM(occupation), '') as occupation
       FROM raw_table)
     SELECT row_number() over () as donor_id, * from tmp"""
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 
-q="""
+q = """
 CREATE TABLE recipients as
     SELECT DISTINCT committee_id as recipient_id, committee_name as name FROM raw_table
 """
 athenautils.athena_start_query(q, database=config.DATABASE)
 
-print('creating contributions table')
+print("creating contributions table")
 
-q="""
+q = """
 CREATE TABLE contributions as
-    SELECT reciept_id as contribution_id, 
-        donors.donor_id as donor_id , 
-        committee_id as recipient_id, 
-        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved, 
-        loan_amount, amount, 
-        receipt_type, vendor_last_name , 
-        vendor_first_name, vendor_address_1, vendor_address_2, 
-        vendor_city, vendor_state, vendor_zip, description, 
-        election_type, election_year, 
-        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin, 
-        date_parse(report_period_end, '%m/%d/%Y') as report_period_end 
-    FROM raw_table JOIN donors donors ON 
-        coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND 
-        coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND 
-        coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND 
-        coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND 
-        coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND 
-        coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND 
-        coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND 
-        coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND 
+    SELECT reciept_id as contribution_id,
+        donors.donor_id as donor_id ,
+        committee_id as recipient_id,
+        report_type, date_parse(date_recieved, '%m/%d/%Y') as date_recieved,
+        loan_amount, amount,
+        receipt_type, vendor_last_name ,
+        vendor_first_name, vendor_address_1, vendor_address_2,
+        vendor_city, vendor_state, vendor_zip, description,
+        election_type, election_year,
+        date_parse(report_period_begin, '%m/%d/%Y') as report_period_begin,
+        date_parse(report_period_end, '%m/%d/%Y') as report_period_end
+    FROM raw_table JOIN donors donors ON
+        coalesce(donors.first_name, '') = coalesce(TRIM(raw_table.first_name), '') AND
+        coalesce(donors.last_name, '') = coalesce(TRIM(raw_table.last_name), '') AND
+        coalesce(donors.address_1, '') = coalesce(TRIM(raw_table.address_1), '') AND
+        coalesce(donors.address_2, '') = coalesce(TRIM(raw_table.address_2), '') AND
+        coalesce(donors.city, '') = coalesce(TRIM(raw_table.city), '') AND
+        coalesce(donors.state, '') = coalesce(TRIM(raw_table.state), '') AND
+        coalesce(donors.employer, '') = coalesce(TRIM(raw_table.employer), '') AND
+        coalesce(donors.occupation , '')= coalesce(TRIM(raw_table.occupation), '') AND
         coalesce(donors.zip, '') = coalesce(TRIM(raw_table.zip), '')"""
 
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 q = """
-CREATE TABLE processed_donors AS  
-    SELECT donor_id,  
-     LOWER(city) AS city,  
-     CASE WHEN (first_name IS NULL AND last_name IS NULL) 
-          THEN NULL 
-          ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' ')) 
-     END AS name,  
-     LOWER(zip) AS zip,  
-     LOWER(state) AS state,  
-     CASE WHEN (address_1 IS NULL AND address_2 IS NULL) 
-          THEN NULL 
+CREATE TABLE processed_donors AS
+    SELECT donor_id,
+     LOWER(city) AS city,
+     CASE WHEN (first_name IS NULL AND last_name IS NULL)
+          THEN NULL
+          ELSE LOWER(array_join(filter(array[first_name, last_name], x-> x IS NOT NULL), ' '))
+     END AS name,
+     LOWER(zip) AS zip,
+     LOWER(state) AS state,
+     CASE WHEN (address_1 IS NULL AND address_2 IS NULL)
+          THEN NULL
           ELSE LOWER(array_join(filter(array[address_1, address_2], x-> x IS NOT NULL), ' '))
-     END AS address,  
-     LOWER(occupation) AS occupation, 
-     LOWER(employer) AS employer, 
-     first_name is null AS person 
+     END AS address,
+     LOWER(occupation) AS occupation,
+     LOWER(employer) AS employer,
+     first_name is null AS person
  FROM donors"""
 athenautils.athena_start_query(q, database=config.DATABASE)
 
 
-
-
-print('done')
+print("done")
diff --git a/athena_example/athenautils.py b/athena_example/athenautils.py
index a88463e2..9a68f367 100644
--- a/athena_example/athenautils.py
+++ b/athena_example/athenautils.py
@@ -1,71 +1,107 @@
 from __future__ import print_function
+import config
 import re
 import boto3
 import botocore
 import sys
-import datetime
 import os
 import time
+from os import listdir
+import shutil
 import pandas as pd
-from six import string_types
-import sys
+
+
 pyver = sys.version_info[0]
 
-if pyver<3:
+if pyver < 3:
     from StringIO import StringIO as SomethingIO
     from urlparse import urlparse
 else:
     from io import BytesIO as SomethingIO
     from urllib.parse import urlparse
-    
-sys.path.insert(0, '../athena_example/')
-import config
 
-s3 = boto3.client('s3', region_name=config.REGION, 
-                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
-  
-athena = boto3.client('athena', region_name=config.REGION, 
-                      aws_access_key_id=config.ACCESS_KEY_ID, aws_secret_access_key=config.SECRET_ACCESS_KEY)
-
-def cursor_execute(query, database=None, cursortype='tuple', buffersize=1000000, 
-                   output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, 
-                   **kwargs):
-    
-    kwargs['chunksize']=buffersize
-    df_cur = athena_to_panda(query, database=database, 
-                             output_location=output_location, region=region, workgroup=workgroup, 
-                             **kwargs)
+sys.path.insert(0, "../athena_example/")
+
+s3 = boto3.client(
+    "s3",
+    region_name=config.REGION,
+    aws_access_key_id=config.ACCESS_KEY_ID,
+    aws_secret_access_key=config.SECRET_ACCESS_KEY,
+)
+
+athena = boto3.client(
+    "athena",
+    region_name=config.REGION,
+    aws_access_key_id=config.ACCESS_KEY_ID,
+    aws_secret_access_key=config.SECRET_ACCESS_KEY,
+)
+
+
+def cursor_execute(
+    query,
+    database=None,
+    cursortype="tuple",
+    buffersize=1000000,
+    output_location=config.ATHENA_GARBAGE_PATH,
+    region=config.REGION,
+    workgroup=config.WORKGROUP,
+    **kwargs
+):
+
+    kwargs["chunksize"] = buffersize
+    df_cur = athena_to_panda(
+        query,
+        database=database,
+        output_location=output_location,
+        region=region,
+        workgroup=workgroup,
+        **kwargs
+    )
     for df in df_cur:
-        if cursortype == 'dict':
-            all_rows = df.where(pd.notnull(df), None).to_dict('records')
-        if cursortype == 'tuple':
+        if cursortype == "dict":
+            all_rows = df.where(pd.notnull(df), None).to_dict("records")
+        if cursortype == "tuple":
             all_rows = df.where(pd.notnull(df), None).itertuples(index=False, name=None)
         for row in all_rows:
             yield row
-            
-            
-def athena_to_panda(query, database=None, 
-                    output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, 
-                    **kwargs):
-    query_execution_id = athena_start_query(query, database=database,
-                                            output_location=output_location, region=region, workgroup=workgroup,
-                                            wait_until_finished=True)
-    df = pandas_read_csv(os.path.join(output_location, query_execution_id+'.csv'), **kwargs)
+
+
+def athena_to_panda(
+    query,
+    database=None,
+    output_location=config.ATHENA_GARBAGE_PATH,
+    region=config.REGION,
+    workgroup=config.WORKGROUP,
+    **kwargs
+):
+    query_execution_id = athena_start_query(
+        query,
+        database=database,
+        output_location=output_location,
+        region=region,
+        workgroup=workgroup,
+        wait_until_finished=True,
+    )
+    df = pandas_read_csv(
+        os.path.join(output_location, query_execution_id + ".csv"), **kwargs
+    )
     return df
 
-def athena_start_query(query, database=None, 
-                       output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP, 
-                       wait_until_finished=True):
+
+def athena_start_query(
+    query,
+    database=None,
+    output_location=config.ATHENA_GARBAGE_PATH,
+    region=config.REGION,
+    workgroup=config.WORKGROUP,
+    wait_until_finished=True,
+):
     query_execution_id = athena.start_query_execution(
         QueryString=query,
-        QueryExecutionContext={
-            'Database': database
-        },    
+        QueryExecutionContext={"Database": database},
         WorkGroup=workgroup,
-        ResultConfiguration={
-            "OutputLocation": output_location
-        }
-    )['QueryExecutionId']
+        ResultConfiguration={"OutputLocation": output_location},
+    )["QueryExecutionId"]
 
     seconds_to_wait = 1
 
@@ -73,115 +109,140 @@ def athena_start_query(query, database=None,
         while True:
             time.sleep(seconds_to_wait)
             seconds_to_wait += 1
-#             seconds_to_wait *= 2
+            #             seconds_to_wait *= 2
 
-            execution = athena.get_query_execution(
-                QueryExecutionId=query_execution_id
-            )
+            execution = athena.get_query_execution(QueryExecutionId=query_execution_id)
 
-            if execution['QueryExecution']['Status']['State'] not in ['QUEUED', 'RUNNING']:
+            if execution["QueryExecution"]["Status"]["State"] not in [
+                "QUEUED",
+                "RUNNING",
+            ]:
                 break
 
-        if execution['QueryExecution']['Status']['State'] != 'SUCCEEDED':
-            raise Exception("Athena query failed: %s" % ( execution['QueryExecution']['Status']['StateChangeReason'],), query_execution_id)
+        if execution["QueryExecution"]["Status"]["State"] != "SUCCEEDED":
+            raise Exception(
+                "Athena query failed: %s"
+                % (execution["QueryExecution"]["Status"]["StateChangeReason"],),
+                query_execution_id,
+            )
 
     return query_execution_id
 
-# Copied from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
+
+# Copied from
+# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
 # Import it instead, when it's updated.
+
+
 def is_s3_url(url):
     """Check for an s3, s3n, or s3a url"""
     try:
         return urlparse(url).scheme in ["s3", "s3n", "s3a"]
     except Exception:
         return False
-    
+
+
 def seperate_bucket_key(url):
-    m = re.match('s3://([^/]+)/(.*)', url)
+    m = re.match("s3://([^/]+)/(.*)", url)
     return m.group(1), m.group(2)
 
+
 def list_all(path):
     if is_s3_url(path):
         bucket, key = seperate_bucket_key(path)
         objects = s3.list_objects_v2(Bucket=bucket, Prefix=key)
-        if not 'Contents' in objects:
+        if "Contents" not in objects:
             return []
-        return [key['Key'] for key in objects['Contents']]
-    from os import listdir
-    from os.path import isfile, join
+        return [key["Key"] for key in objects["Contents"]]
     if not os.path.exists(path):
         return []
     return listdir(path)
 
+
 def del_all_files(path):
     filelist = list_all(path)
     if is_s3_url(path):
         bucket, key = seperate_bucket_key(path)
         for f in filelist:
-            s3.delete_object(Bucket=bucket, Key=f)    
+            s3.delete_object(Bucket=bucket, Key=f)
         return
     filelist = [os.path.join(path, f) for f in filelist]
     for f in filelist:
         if os.path.isfile(f):
             os.remove(f)
-        else:    
-            shutil.rmtree(f)  
-            
-def drop_external_table(tablename, location , database=None, 
-                        output_location=config.ATHENA_GARBAGE_PATH, region=config.REGION, workgroup=config.WORKGROUP):
-    athena_start_query('drop table if exists {}'.format(tablename), database=database, 
-                       output_location=output_location, region=region, workgroup=workgroup)
+        else:
+            shutil.rmtree(f)
+
+
+def drop_external_table(
+    tablename,
+    location,
+    database=None,
+    output_location=config.ATHENA_GARBAGE_PATH,
+    region=config.REGION,
+    workgroup=config.WORKGROUP,
+):
+    athena_start_query(
+        "drop table if exists {}".format(tablename),
+        database=database,
+        output_location=output_location,
+        region=region,
+        workgroup=workgroup,
+    )
     del_all_files(location)
 
 
-    
-
 def pandas_read_csv(filepath_or_buffer, **kwargs):
     bucket, key = seperate_bucket_key(filepath_or_buffer)
     obj = s3.get_object(Bucket=bucket, Key=key)
-    return pd.read_csv(SomethingIO(obj['Body'].read()),  **kwargs)
+    return pd.read_csv(SomethingIO(obj["Body"].read()), **kwargs)
+
 
 def read(filename):
     if is_s3_url(filename):
         bucket, key = seperate_bucket_key(filename)
-        obj=s3.get_object(Bucket=bucket, Key=key)
-        return obj['Body'].read()
-    with open (filename) as f:
+        obj = s3.get_object(Bucket=bucket, Key=key)
+        return obj["Body"].read()
+    with open(filename) as f:
         return f.read()
 
+
 def write(body, filename):
     bucket, key = seperate_bucket_key(filename)
     s3.put_object(Bucket=bucket, Key=key, Body=body)
     return
-        
+
 
 def file_name_append(filename, append, ommitext):
-    filename_base, ext  = os.path.splitext(filename)
-    if ommitext: 
-        return '%s%s' % (filename_base, append)
-    return '%s%s%s' % (filename_base, append, ext)
+    filename_base, ext = os.path.splitext(filename)
+    if ommitext:
+        return "%s%s" % (filename_base, append)
+    return "%s%s%s" % (filename_base, append, ext)
+
 
 def write_many(read_cursor, filename, buffersize=config.BUFFERSIZE):
-    chunkcount=0
+    chunkcount = 0
     while True:
         buffer_df = pd.DataFrame.from_records(read_cursor, nrows=buffersize)
-        if buffer_df.empty: 
-            break        
-        buffer = buffer_df.to_csv(index=False, header=False, sep='\t')
-        chunk_fname = file_name_append(filename, '_{}'.format(chunkcount), ommitext=False)
+        if buffer_df.empty:
+            break
+        buffer = buffer_df.to_csv(index=False, header=False, sep="\t")
+        chunk_fname = file_name_append(
+            filename, "_{}".format(chunkcount), ommitext=False
+        )
         write(buffer, chunk_fname)
         chunkcount += 1
-        
+
+
 def file_exists(filename):
     bucket, key = seperate_bucket_key(filename)
     try:
         s3.get_object(Bucket=bucket, Key=key)
     except botocore.exceptions.ClientError as e:
-        if e.response['Error']['Code']=='NoSuchKey':
+        if e.response["Error"]["Code"] == "NoSuchKey":
             return False
         else:
             # Something else has gone wrong.
             raise
     else:
         return True
-