! " , Power BI, Python" (). , . . . , , .
"" Python Power BI? Python/R c Power BI (Tableau, Qlik) 70-80% . . , SQL. , .
For the database part I use PostgreSQL: it is free, widely used and easy to set up. If you prefer not to install PostgreSQL locally, it can just as well be run in Docker.
The source data is a csv file with sales published on GitHub (). The database and the table could be created by hand in pgAdmin, but to keep the whole pipeline reproducible I do everything from Python; PostgreSQL here simply plays the role of storage.
First, let's create the sales table; the script below drops any old copy and creates a fresh one.
import psycopg2

# connect to the database
conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
print("Database opened successfully")

# open a cursor and (re)create the sales table inside a transaction
cursor = conn.cursor()
with conn:
    cursor.execute("""
        DROP TABLE IF EXISTS sales;
    """)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS sales (
            id SERIAL PRIMARY KEY,
            date DATE NOT NULL,
            promo TEXT NOT NULL,
            site TEXT NOT NULL,
            user_id TEXT NOT NULL,
            transaction_id INTEGER NOT NULL,
            amount INTEGER NOT NULL);
    """)
print("Operation done successfully")

# close the cursor and the connection
cursor.close()
conn.close()
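The result can be checked in pgAdmin, but the same check is easy to do from Python as well. The snippet below is not from the original article, just a quick sketch reusing the connection parameters above to list the columns of the freshly created table:
import psycopg2

conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
cursor = conn.cursor()

# list the columns of the sales table straight from the system catalog
cursor.execute("""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_name = 'sales'
    ORDER BY ordinal_position;
""")
for column_name, data_type in cursor.fetchall():
    print(column_name, data_type)

cursor.close()
conn.close()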
The table is ready, so the next step is to load the csv file into it. The first approach uses Pandas together with sqlalchemy; dates in the file are parsed on the fly, and datetime is used to measure how long the load takes.
import os
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from datetime import datetime

start_time = datetime.now()

# create the sqlalchemy engine (psycopg2 is used as the driver)
engine = create_engine('postgresql://postgres:gfhjkm@localhost:5432/db')
print("Database opened successfully")

# folder with the source csv file
path_to_data = "C:/Users/Pavel/PycharmProjects/database/"

# read the csv, parsing the day-first dates in the 'date' column
sale_records = pd.read_csv(os.path.join(path_to_data, "ohortAnalysis_2016_2018.csv"),
                           sep=";", parse_dates=["date"], dayfirst=True)
postgresql_table = "sales"

# append the DataFrame to the sales table
sale_records.to_sql(postgresql_table, engine, if_exists='append', index=False)
print("Operation done successfully")

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
The load took 3 minutes 26 seconds, which is rather slow. Let's check whether the overhead comes from sqlalchemy and insert the rows through psycopg2 directly.
import os
import pandas as pd
import psycopg2
from datetime import datetime

start_time = datetime.now()

# connect to the database
conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
print("Database opened successfully")

# open a cursor
cursor = conn.cursor()

# folder with the source csv file
path_to_data = "C:/Users/Pavel/PycharmProjects/database/"

# read the csv, parsing the day-first dates in the 'date' column
sale_records = pd.read_csv(os.path.join(path_to_data, "ohortAnalysis_2016_2018.csv"),
                           sep=";", parse_dates=["date"], dayfirst=True)

# insert the rows with a single executemany() call
query = "INSERT INTO sales (date, promo, site, user_id, transaction_id, amount) values (%s, %s, %s, %s, %s, %s)"
dataset_for_db = sale_records.values.tolist()
cursor.executemany(query, dataset_for_db)
conn.commit()
print("Operation done successfully")

# close the cursor and the connection
cursor.close()
conn.close()

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
This variant took about 10 minutes, i.e. even slower than loading through pandas and sqlalchemy. The last option left to try is PostgreSQL's COPY, available in psycopg2 as copy_from().
import psycopg2
from datetime import datetime

start_time = datetime.now()

# connect to the database
conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
print("Database opened successfully")

# open a cursor
cursor = conn.cursor()

# open the csv file, skip the header row and bulk-load it with COPY
with open('ohortAnalysis_2016_2018.csv', 'r', encoding='UTF8') as f:
    next(f)
    cursor.copy_from(f, 'sales', sep=';', columns=('date', 'promo', 'site', 'user_id', 'transaction_id', 'amount'))
conn.commit()
print("Operation done successfully")

# close the cursor and the connection
cursor.close()
conn.close()

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
This time the load took about 7 seconds: copy_from() is by far the fastest of the three approaches. It is a bit less flexible, since the file is handed to the database almost as-is, but for bulk loading it is the clear winner; pandas stays convenient when the data has to be transformed before it is inserted.
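As a side note, a middle ground that is not benchmarked here is to keep the pandas-based load but batch the INSERT statements: to_sql() accepts the chunksize and method='multi' parameters for exactly this. A minimal sketch, assuming the same file, table and connection string as above:
import os
import pandas as pd
from sqlalchemy import create_engine

# same connection string and file as in the examples above
engine = create_engine('postgresql://postgres:gfhjkm@localhost:5432/db')
path_to_data = "C:/Users/Pavel/PycharmProjects/database/"

sale_records = pd.read_csv(os.path.join(path_to_data, "ohortAnalysis_2016_2018.csv"),
                           sep=";", parse_dates=["date"], dayfirst=True)

# method='multi' packs many rows into each INSERT statement,
# chunksize limits how many rows go into one batch
sale_records.to_sql("sales", engine, if_exists='append', index=False,
                    chunksize=10_000, method='multi')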
The data is loaded, so now it has to be shaped for the cohort analysis. This could be done in Python inside Power BI, but it is simpler and faster to prepare the data on the database side with SQL. Below are two queries that produce the same result.
SELECT s3.date,
       s3.user_id,
       s3.date - s2.first_date AS delta_days,
       ceil((s3.date - s2.first_date)::real/30::real)*30 AS cohort_days,
       to_char(s2.first_date,'YYYY-MM') AS first_transaction,
       s3.amount
FROM public.sales AS s3
LEFT JOIN
    (SELECT s1.user_id,
            MIN(s1.date) AS first_date
     FROM public.sales AS s1
     GROUP BY s1.user_id) AS s2 ON s3.user_id = s2.user_id
ORDER BY s3.user_id,
         s3.date
The same result can be obtained more compactly with window functions:
SELECT s.date,
       s.user_id,
       s.date - FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date) AS delta_days,
       ceil((s.date - FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date))::real/30::real)*30 AS cohort_days,
       to_char(FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date),'YYYY-MM') AS first_transaction,
       s.amount
FROM public.sales AS s
ORDER BY s.user_id,
         s.date
Both queries return the same flat result: for every transaction we now know the date of the user's first purchase, how many days have passed since it (delta_days) and which 30-day cohort bucket the transaction falls into (cohort_days). The window-function variant is shorter and avoids the self-join. The month of the first transaction, which will label the cohorts, is formatted with PostgreSQL's to_char().
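If you want to sanity-check the result from Python rather than pgAdmin, either query can be read straight into a DataFrame. A minimal sketch, assuming the connection string from the loading examples (the query is shortened here for brevity; paste either full query in its place):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://postgres:gfhjkm@localhost:5432/db')

# shortened version of the window-function query above
query = """
SELECT s.date,
       s.user_id,
       s.date - FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date) AS delta_days,
       s.amount
FROM public.sales AS s
ORDER BY s.user_id, s.date
"""

cohort_data = pd.read_sql(query, engine)
print(cohort_data.head())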
The only non-obvious part is how a transaction is assigned to its cohort period. The buckets could be spelled out explicitly with a CASE expression, but with roughly three years of data that turns into a long list of nearly identical conditions, so a short formula is used instead; everything it needs is already built into PostgreSQL.
The logic is as follows. The cohort step is 30 days, so delta_days (the number of days since the user's first purchase) is divided by 30 and the result is rounded up: 0 stays 0, anything from 1 to 30 becomes 1, from 31 to 60 becomes 2, and so on. Rounding up is done with PostgreSQL's ceil(), and multiplying the result back by 30 turns the bucket number into days, so the cohort periods are labelled 0, 30, 60, 90 and so on.
There is one subtlety: subtracting two dates yields an INTEGER, and dividing an INTEGER by an INTEGER in PostgreSQL is integer division, which silently discards the remainder, so ceil() would have nothing left to round up. Watch out for this! That is why both operands of the division are cast with ::real.
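To make the formula concrete, here is the same arithmetic reproduced in plain Python (the helper name cohort_bucket is mine, not from the article):
import math

def cohort_bucket(delta_days, step=30):
    # the SQL expression ceil(delta::real / step::real) * step, reproduced in Python
    return math.ceil(delta_days / step) * step

# a user's first purchase (delta 0) stays in bucket 0,
# days 1..30 fall into bucket 30, days 31..60 into bucket 60, etc.
for delta in (0, 1, 30, 31, 60, 61):
    print(delta, '->', cohort_bucket(delta))

# the pitfall the ::real cast avoids: integer division truncates first,
# so ceil() has nothing left to round up
print(math.ceil(31 // 30) * 30)   # 30 -- wrong bucket
print(math.ceil(31 / 30) * 30)    # 60 -- correct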
The result of either query is a flat table in which every transaction carries the month of the user's first purchase and the cohort bucket it belongs to.
To avoid repeating this query every time, I keep it in the database as report_cohort_analysis, and the next step builds on top of it.
What remains is to calculate, for every cohort, how its cumulative revenue grows from one 30-day period to the next.
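The article's text does not spell out how report_cohort_analysis is created, so the snippet below is just one possible way to do it: wrapping the window-function query into a view, using the same psycopg2 connection as in the earlier examples.
import psycopg2

conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
cursor = conn.cursor()

# wrap the window-function query into a view so later queries
# can simply select from public.report_cohort_analysis
with conn:
    cursor.execute("""
        CREATE OR REPLACE VIEW public.report_cohort_analysis AS
        SELECT s.date,
               s.user_id,
               s.date - FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date) AS delta_days,
               ceil((s.date - FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date))::real/30::real)*30 AS cohort_days,
               to_char(FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date),'YYYY-MM') AS first_transaction,
               s.amount
        FROM public.sales AS s
    """)

cursor.close()
conn.close()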
SELECT r2.first_transaction,
       r2.cohort_days,
       --r2.total_amount,
       --sum(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days) as cumsum_amount,
       --first_value(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days) as first_total_amount,
       round((sum(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days)/
              first_value(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days)-1),3) as percent_cumsum_amount
FROM
    (SELECT r.first_transaction, r.cohort_days, sum(r.amount) AS total_amount
     FROM public.report_cohort_analysis AS r
     GROUP BY r.first_transaction, r.cohort_days
     ORDER BY r.first_transaction, r.cohort_days) as r2
The result is a table that shows, for every cohort (month of first purchase), how much its cumulative revenue has grown by each 30-day period relative to the cohort's first period.
The last step is to pivot this long table into the familiar cohort matrix. It could also be done in SQL: PostgreSQL has CROSSTAB, but the query quickly becomes clumsy, so it is easier to leave the pivoting to the BI tool. Power BI can connect to PostgreSQL directly (or fetch the data through a Python script), run the SQL query above and lay the result out as a matrix.
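If the pivot is done on the Python side (for example inside a Power BI Python script), it is a one-liner over the query result. A minimal sketch, reusing the final query and the connection string from the examples above:
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://postgres:gfhjkm@localhost:5432/db')

# the cumulative-revenue query from above, kept only with the final column
final_query = """
SELECT r2.first_transaction,
       r2.cohort_days,
       round((sum(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days)/
              first_value(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days)-1),3) as percent_cumsum_amount
FROM
    (SELECT r.first_transaction, r.cohort_days, sum(r.amount) AS total_amount
     FROM public.report_cohort_analysis AS r
     GROUP BY r.first_transaction, r.cohort_days) as r2
"""
cohorts = pd.read_sql(final_query, engine)

# rows = cohort (month of first purchase), columns = 30-day period,
# values = cumulative revenue growth relative to the cohort's first period
cohort_matrix = cohorts.pivot_table(index='first_transaction',
                                    columns='cohort_days',
                                    values='percent_cumsum_amount')
print(cohort_matrix)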
That is the whole pipeline: the data is loaded into PostgreSQL, shaped with SQL and handed over to Power BI, where the cohort matrix is built.
Thanks for reading, and good luck!