Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.env
211 changes: 211 additions & 0 deletions PYTHON_TEST.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
# 1) The Dev Team was tired of developing the same old queries, just varying the filters according to their boss's demands.
# As a new member of the crew, your mission now is to create a dynamic function in Python, in the most flexible way
# possible, to produce queries and retrieve a dataframe based on three parameters:

# product_code: integer
# store_code: integer
# date: list of ISO-like strings

# Date e.g.
# ['2019-01-01', '2019-01-31']
# It should look like this: my_data = retrieve_data(product_code, store_code, date)

# Extra instructions:

# Retrieve all columns from table data_product_sales;
# Imagine people from other teams will also utilize this function!

from dotenv import load_dotenv
import mysql.connector
import pandas as pd
import os

load_dotenv()

# Connection template taken from the MySQL documentation available in the challenge repository.
def connect_db():
    """Open a MySQL connection configured from environment variables.

    Reads USER, PASSWORD, HOST, PORT and DATABASE from the process
    environment. PORT falls back to 3306 (the standard MySQL port) when it
    is unset or empty, and is converted to an integer before being handed
    to the driver.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
        The caller is responsible for closing it.

    Raises:
        ValueError: if PORT is set to something that is not an integer.
    """
    # NOTE(review): USER is commonly pre-set by the shell on Unix-like
    # systems, and load_dotenv() does not override existing variables by
    # default -- confirm the value from .env is the one actually used here.

    # os.getenv returns a string or None; coerce it here so an unset PORT
    # never reaches the driver as None.
    port = int(os.getenv("PORT") or 3306)

    return mysql.connector.connect(
        user=os.getenv("USER"),
        password=os.getenv("PASSWORD"),
        host=os.getenv("HOST"),
        port=port,
        database=os.getenv("DATABASE"),
    )

def retrieve_data(product_code: int, store_code: int, date: list) -> pd.DataFrame:
    """Fetch product sales rows for one product/store within a date range.

    Args:
        product_code: value matched against the PRODUCT_CODE column.
        store_code: value matched against the STORE_CODE column.
        date: two-item sequence ``[start, end]`` of ISO-like date strings,
            e.g. ``['2019-01-01', '2019-01-31']``. Both ends are inclusive
            (SQL ``BETWEEN``).

    Returns:
        A DataFrame with every column of ``data_product_sales`` for the
        matching rows, ordered by DATE descending.

    Raises:
        ValueError: if ``date`` does not contain exactly two items.
    """
    # Validate before opening a connection so bad input never costs a
    # round-trip to the database (and never leaves a connection behind).
    if len(date) != 2:
        raise ValueError(
            f"date must contain exactly two items [start, end]; got {len(date)}"
        )
    start_date, end_date = date

    # Values are bound through driver placeholders, never interpolated
    # into the SQL text.
    query = '''
        SELECT *
        FROM data_product_sales
        WHERE PRODUCT_CODE = %s
        AND STORE_CODE = %s
        AND DATE BETWEEN %s AND %s
        ORDER BY DATE DESC;
    '''

    cnx = connect_db()
    try:
        return pd.read_sql(
            query,
            con=cnx,
            params=(product_code, store_code, start_date, end_date),
        )
    finally:
        # Release the connection even when the query raises.
        cnx.close()

# 2) A brand new client sent you two ready-to-go queries. Those are listed below:

# Query 1:
# SELECT
# STORE_CODE,
# STORE_NAME,
# START_DATE,
# END_DATE,
# BUSINESS_NAME,
# BUSINESS_CODE
# FROM data_store_cad

# Query 2:
# SELECT
# STORE_CODE,
# DATE,
# SALES_VALUE,
# SALES_QTY
# FROM data_store_sales
# WHERE DATE BETWEEN '2019-01-01' AND '2019-12-31'
# In addition, he gave you this set of instructions:

# Use the queries as they are (do not modify them or create a new one);

# Please filter the period between this given range:
# ['2019-10-01','2019-12-31']

# We are in need of this visualization (click here to see it)! Please, create it with Python

# Loja Categoria TM
# Bahia Atacado 15.39
# Bangkok Posto 13.67
# Belem Proximidade 15.37
# ...

def _build_tm_table(store_sales, store_cad, date):
    """Restrict sales to ``date`` and compute TM per store and category."""
    start, end = (pd.Timestamp(bound) for bound in date)

    # Normalise DATE to timestamps so the comparison does not depend on how
    # the driver returned the column.
    sale_dates = pd.to_datetime(store_sales['DATE'])
    in_period = store_sales[sale_dates.between(start, end)]

    totals = (
        pd.merge(left=in_period, right=store_cad, how='left', on='STORE_CODE')
        .groupby(['STORE_NAME', 'BUSINESS_NAME'])
        .agg({'SALES_VALUE': 'sum', 'SALES_QTY': 'sum'})
        .reset_index()
    )

    # TM is computed here as total sales value divided by total quantity.
    totals['TM'] = (totals['SALES_VALUE'] / totals['SALES_QTY']).round(2)

    return totals[['STORE_NAME', 'BUSINESS_NAME', 'TM']].rename(
        columns={'STORE_NAME': 'Loja', 'BUSINESS_NAME': 'Categoria'}
    )


def tm_viz(date=('2019-10-01', '2019-12-31')):
    """Build the Loja / Categoria / TM table requested by the client.

    The two client queries are executed exactly as supplied; the period
    filter is applied afterwards in pandas, because the queries themselves
    must not be modified.

    Args:
        date: two-item sequence ``(start, end)`` of ISO-like date strings,
            both inclusive. Defaults to the range requested in the task,
            2019-10-01 through 2019-12-31. The second client query only
            returns 2019 rows, so dates outside 2019 match nothing.

    Returns:
        A DataFrame with columns ``Loja``, ``Categoria`` and ``TM``, one
        row per (store name, business name) pair.

    Raises:
        ValueError: if ``date`` does not contain exactly two items.
    """
    query1 = '''
        SELECT
        STORE_CODE,
        STORE_NAME,
        START_DATE,
        END_DATE,
        BUSINESS_NAME,
        BUSINESS_CODE
        FROM data_store_cad
    '''

    query2 = '''
        SELECT
        STORE_CODE,
        DATE,
        SALES_VALUE,
        SALES_QTY
        FROM data_store_sales
        WHERE DATE BETWEEN '2019-01-01' AND '2019-12-31'
    '''

    cnx = connect_db()
    try:
        table_store_cad = pd.read_sql(query1, con=cnx)
        table_store_sales = pd.read_sql(query2, con=cnx)
    finally:
        # Release the connection even when one of the queries raises.
        cnx.close()

    return _build_tm_table(table_store_sales, table_store_cad, date)

# 3) Building your own visualization
# Create at least one chart using the table IMDB_movies. The code must be in Python, and you are free to use any
# libraries, data in the table and graphic format. Explain why you chose the visualization (or visualizations) you are
# submitting.

import matplotlib.pyplot as plt
import seaborn as sns

def _plot_rating_vs_runtime(data):
    """Draw yearly mean Rating and mean Runtime on twin y-axes (chart1.png)."""
    # This chart was chosen to show how ratings relate to movie length over
    # time: older movies with longer runtimes had higher ratings, suggesting
    # that a longer runtime leaves room for better story and character
    # development, which is perceived as higher quality.
    #
    # Rating is used as the reference score because the general audience is
    # who consumes the movie and brings in its revenue, not the critics
    # (MetaScore).
    yearly = data.groupby('Year')[['Rating', 'Runtime']].mean()

    # Two y-axes sharing one x-axis: ax1 carries the rating, ax2 the runtime.
    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax1.set_xlabel("Ano")
    ax1.set_ylabel("Nota Média (Rating)", color="tab:blue")
    ax1.plot(yearly.index, yearly['Rating'].values, color="tab:blue", marker="o")

    ax2 = ax1.twinx()
    ax2.set_ylabel("Duração Média (Minutos)", color="tab:red")
    ax2.plot(yearly.index, yearly['Runtime'].values, color="tab:red", marker="D")

    plt.title("Comparação da Nota Média vs Duração Média dos Filmes ao Longo dos Anos", fontweight="bold")
    plt.savefig("chart1.png", dpi=300, bbox_inches="tight")
    plt.show()


def _plot_revenue_by_genre(data):
    """Draw total RevenueMillions per genre as horizontal bars (chart2.png)."""
    # This chart was chosen because it helps decide which genre to pick when
    # producing a movie, giving an overview of the market.
    #
    # One indicator column per genre; a movie listing several genres
    # contributes its full revenue to each of them.
    genres = data['Genre'].str.get_dummies(",")
    revenue_by_genre = genres.multiply(data["RevenueMillions"], axis=0).sum().sort_values()

    ax = revenue_by_genre.plot(kind="barh", color="skyblue", figsize=(12, 6))
    ax.bar_label(ax.containers[0], fontsize=10)

    plt.title('Receita por Gênero', fontweight="bold")
    plt.xlabel('Receita (Milhões $)')
    plt.ylabel('Gênero')
    plt.tight_layout()
    plt.savefig("chart2.png", dpi=300, bbox_inches="tight")
    plt.show()


def imdb_viz():
    """Load the IMDB_movies table and render the two charts.

    Writes ``chart1.png`` (mean rating vs. mean runtime per year) and
    ``chart2.png`` (revenue per genre) to the current working directory and
    shows each figure. Returns nothing.
    """
    query = '''
        SELECT *
        FROM IMDB_movies
    '''

    cnx = connect_db()
    try:
        data = pd.read_sql(query, con=cnx)
    finally:
        # Release the connection even when the query raises.
        cnx.close()

    _plot_rating_vs_runtime(data)
    _plot_revenue_by_genre(data)

if __name__ == "__main__":
    # Case 1: sales of product 21 at store 9 for the given date range.
    sample_period = ['2019-01-01', '2019-01-31']
    print(retrieve_data(21, 9, sample_period))

    # Case 2: Loja / Categoria / TM table.
    print(tm_viz())

    # Case 3: IMDB charts (saved to disk and shown).
    imdb_viz()
19 changes: 19 additions & 0 deletions SQL_TEST.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- What are the 10 most expensive products in the company?
-- Sorted by PRODUCT_VAL, highest first, keeping only the first ten rows.
SELECT
    p.PRODUCT_NAME,
    p.PRODUCT_VAL
FROM data_product AS p
ORDER BY p.PRODUCT_VAL DESC
LIMIT 10;

-- What sections do the 'BEBIDAS' and 'PADARIA' departments have?
-- DISTINCT collapses the one-row-per-product table to one row per section name.
SELECT DISTINCT
    p.SECTION_NAME
FROM data_product AS p
WHERE p.DEP_NAME = 'BEBIDAS'
   OR p.DEP_NAME = 'PADARIA';

-- What was the total sale of products (in $) of each Business Area in the first quarter of 2019?
-- SALES_VALUE is summed as-is rather than multiplied by SALES_QTY.
-- NOTE(review): presumably SALES_VALUE already holds the monetary amount of
-- each sales row, not a unit price -- confirm against the table definition.
-- The half-open date range keeps all of 2019-03-31 even if DATE carries a
-- time component.
-- LEFT JOIN keeps sales whose store has no registry row; they are grouped
-- under a NULL BUSINESS_NAME.
SELECT
    sc.BUSINESS_NAME,
    SUM(ps.SALES_VALUE) AS TOTAL_SALES
FROM data_product_sales AS ps
LEFT JOIN data_store_cad AS sc
    ON ps.STORE_CODE = sc.STORE_CODE
WHERE ps.DATE >= '2019-01-01'
  AND ps.DATE < '2019-04-01'
GROUP BY sc.BUSINESS_NAME
ORDER BY TOTAL_SALES DESC;
Binary file added chart1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added chart2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.