From 8457b91fae682561bec6e57152a31d2a6350cae9 Mon Sep 17 00:00:00 2001 From: Martin Bolot Date: Fri, 7 Dec 2018 22:03:33 +0100 Subject: [PATCH 1/9] ajout big data --- .gitignore | 2 + CPE-Lyon/Big Data/TODO.md | 7 + CPE-Lyon/Big Data/pipeline_taxi.sql | 138 ++++++++++++++ CPE-Lyon/Big Data/pipeline_violations.sql | 180 ++++++++++++++++++ .../scripts_violations/create_refine1.sql | 52 +++++ .../create_refine1_nbviolations.sql | 21 ++ .../scripts_violations/create_refine2.sql | 29 +++ .../create_refine2_nbviolations.sql | 22 +++ .../scripts_violations/create_refine3.sql | 22 +++ .../scripts_violations/create_refine4.sql | 38 ++++ .../scripts_violations/create_results.sql | 14 ++ 11 files changed, 525 insertions(+) create mode 100644 CPE-Lyon/Big Data/TODO.md create mode 100644 CPE-Lyon/Big Data/pipeline_taxi.sql create mode 100644 CPE-Lyon/Big Data/pipeline_violations.sql create mode 100644 CPE-Lyon/Big Data/scripts_violations/create_refine1.sql create mode 100644 CPE-Lyon/Big Data/scripts_violations/create_refine1_nbviolations.sql create mode 100644 CPE-Lyon/Big Data/scripts_violations/create_refine2.sql create mode 100644 CPE-Lyon/Big Data/scripts_violations/create_refine2_nbviolations.sql create mode 100644 CPE-Lyon/Big Data/scripts_violations/create_refine3.sql create mode 100644 CPE-Lyon/Big Data/scripts_violations/create_refine4.sql create mode 100644 CPE-Lyon/Big Data/scripts_violations/create_results.sql diff --git a/.gitignore b/.gitignore index 66791da..f168b6b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ ./COO Design Pattern/*/.project CPE-Lyon/JEE2/.idea/* #CPE-Lyon/JEE2/*/target/* + +CPE-Lyon/Big\ Data/connect.sh diff --git a/CPE-Lyon/Big Data/TODO.md b/CPE-Lyon/Big Data/TODO.md new file mode 100644 index 0000000..489d18a --- /dev/null +++ b/CPE-Lyon/Big Data/TODO.md @@ -0,0 +1,7 @@ +- Créer tables sans external (internal par défaut) +- Ajouter les données des contraventions de l'année fiscale 2017 +- Ajouter une étape de suppression des 
valeurs aberrantes
+- Faire des scripts hive
+- Harmoniser les noms de tables
+- Recalculer les agrégats et les utiliser dans tableau
+- Ajouter la géoloc
diff --git a/CPE-Lyon/Big Data/pipeline_taxi.sql b/CPE-Lyon/Big Data/pipeline_taxi.sql
new file mode 100644
index 0000000..d637464
--- /dev/null
+++ b/CPE-Lyon/Big Data/pipeline_taxi.sql
@@ -0,0 +1,138 @@
+-- create: external table over the comma-delimited text files in /user/formation35/taxi/ (first line of each file is skipped as a header); "if not exists" lets the script be re-run
+create external table if not exists raw_taxi (
+    VendorID int,
+    tpep_pickup_datetime string,
+    tpep_dropoff_datetime string,
+    passenger_count int,
+    trip_distance int, -- NOTE(review): int cannot hold fractional values; confirm distances and *_amount fields are integral in the source files
+    pickup_longitude decimal(9,6), -- explicit scale: a bare decimal is decimal(10,0) in Hive and cannot store fractional degrees
+    pickup_latitude decimal(9,6),
+    RatecodeID int,
+    store_and_fwd_flag string,
+    dropoff_longitude decimal(9,6),
+    dropoff_latitude decimal(9,6),
+    payment_type int,
+    fare_amount int,
+    extra int,
+    mta_tax int,
+    tip_amount int,
+    tolls_amount int,
+    improvement_surcharge int,
+    total_amount int,
+    PULocationID int,
+    DOLocationID int
+)
+row format delimited fields terminated by ','
+stored as textfile
+location '/user/formation35/taxi/'
+tblproperties ("skip.header.line.count"="1");
+
+-- reduce the number of rows: keep 100000 rows of raw_taxi as ORC (no ORDER BY, so which rows are kept is not deterministic)
+create table if not exists refine1_taxi(
+    VendorID int,
+    tpep_pickup_datetime string,
+    tpep_dropoff_datetime string,
+    passenger_count int,
+    trip_distance int,
+    pickup_longitude decimal(9,6),
+    pickup_latitude decimal(9,6),
+    RatecodeID int,
+    store_and_fwd_flag string,
+    dropoff_longitude decimal(9,6),
+    dropoff_latitude decimal(9,6),
+    payment_type int,
+    fare_amount int,
+    extra int,
+    mta_tax int,
+    tip_amount int,
+    tolls_amount int,
+    improvement_surcharge int,
+    total_amount int,
+    PULocationID int,
+    DOLocationID int
+)
+stored as orc;
+insert overwrite table refine1_taxi -- overwrite (not append) so a re-run does not duplicate rows
+select * -- relies on raw_taxi and refine1_taxi declaring the same columns in the same order
+from raw_taxi limit 100000;
+
+-- reduce the number of columns: keep only the pickup/drop-off timestamps and location ids
+create table if not exists refine2_taxi(
+    tpep_pickup_datetime string,
+    tpep_dropoff_datetime string,
+    PULocationID int,
+    DOLocationID int
+)
+stored as orc;
+insert overwrite table refine2_taxi
+select tpep_pickup_datetime, tpep_dropoff_datetime, PULocationID, DOLocationID
+from refine1_taxi;
+
+-- clean (remove null
pk)
+create table if not exists refine3_taxi(
+    tpep_pickup_datetime string,
+    tpep_dropoff_datetime string,
+    PULocationID int,
+    DOLocationID int
+)
+stored as orc;
+insert overwrite table refine3_taxi -- overwrite (not append) so a re-run does not duplicate rows
+select tpep_pickup_datetime, tpep_dropoff_datetime, PULocationID, DOLocationID
+from refine2_taxi
+where tpep_dropoff_datetime is not null;
+
+-- format: parse the leading MM/dd/yyyy part of each timestamp string into a date (substr positions are 1-based)
+create table if not exists refine4_taxi(
+    tpep_pickup_datetime date,
+    tpep_dropoff_datetime date,
+    PULocationID int,
+    DOLocationID int
+)
+stored as orc;
+insert overwrite table refine4_taxi
+select
+    to_date(from_unixtime(unix_timestamp(substr(tpep_pickup_datetime, 1, 10), 'MM/dd/yyyy'))),
+    to_date(from_unixtime(unix_timestamp(substr(tpep_dropoff_datetime, 1, 10), 'MM/dd/yyyy'))),
+    PULocationID,
+    DOLocationID
+from refine3_taxi;
+
+-- aggregate: one row per pickup date with the number of non-null pickup dates (nb_pu) and non-null drop-off dates (nb_do)
+create table if not exists refine1_nbtaxi(
+    taxi_ride_date date,
+    nb_pu int,
+    nb_do int
+)
+stored as orc;
+insert overwrite table refine1_nbtaxi
+select
+    tpep_pickup_datetime,
+    count(tpep_pickup_datetime) as nb_pu,
+    count(tpep_dropoff_datetime) as nb_do -- NOTE(review): counted per pickup date, not per drop-off date; confirm this is intended
+from refine4_taxi
+group by tpep_pickup_datetime;
+
+/*
+// normalize [select (value-MIN) / MAX-MIN)]
+
+//