Cohort Analysis Revisited: A Comprehensive Approach with Python, SQL, and Power BI

Hello! This post continues an earlier article on cohort analysis done with Power BI and Python (a link is in the original). This time the approach is more comprehensive: the raw data goes into a database, the heavy calculations move into SQL, and Power BI is left with what it does best, visualization.




Why complicate a setup that already works? A bundle of Python or R with a BI tool such as Power BI, Tableau or Qlik covers roughly 70-80% of everyday analytical tasks. But as data volumes grow, it pays to keep the data in a database and push the heavy preprocessing into SQL, leaving the BI tool to visualize the prepared result.





The SQL side runs on PostgreSQL. Any instance will do: a local installation or an existing server; if you do not have PostgreSQL at hand, the quickest way to spin one up is Docker.





The source data is a csv file published on GitHub (a link is in the original). It could be imported by hand through pgAdmin, but the file is large enough to make that tedious, so we will load it with Python instead and, along the way, compare several ways of writing data into PostgreSQL.





First we need the target table, sales. The script below connects to the database, drops the old table if it exists and creates a fresh one.





import psycopg2

# connect to the database
conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")

print("Database opened successfully")

# create a cursor for executing queries
cursor = conn.cursor()

with conn:
    cursor.execute("""
            DROP TABLE IF EXISTS sales;
        """)

    cursor.execute("""
            CREATE TABLE IF NOT EXISTS sales (
              id SERIAL PRIMARY KEY,
              date DATE NOT NULL, 
              promo TEXT NOT NULL,
              site TEXT NOT NULL,
              user_id TEXT NOT NULL,
              transaction_id INTEGER NOT NULL,
              amount INTEGER NOT NULL);
        """)


print("Operation done successfully")

# close the cursor and the connection
cursor.close()
conn.close()
      
      



Now for the data itself. The first loading approach uses pandas together with sqlalchemy (DataFrame.to_sql). To compare the approaches, each script is timed with datetime.





import os
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from datetime import datetime

start_time = datetime.now()

# create a sqlalchemy engine for the database
engine = create_engine('postgresql://postgres:gfhjkm@localhost:5432/db')

print("Database opened successfully")

# path to the folder with the source data
path_to_data = "C:/Users/Pavel/PycharmProjects/database/"
# read the csv file into a DataFrame
sale_records = pd.read_csv(os.path.join(path_to_data, "ohortAnalysis_2016_2018.csv"),
                           sep=";", parse_dates=["date"], dayfirst=True)
postgresql_table = "sales"
# write the DataFrame to the sales table
sale_records.to_sql(postgresql_table, engine, if_exists='append', index=False)

print("Operation done successfully")

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
      
      



The load took 3 minutes 26 seconds, which is rather slow. Let's drop sqlalchemy and insert the rows through psycopg2 directly with executemany.





import os
import pandas as pd
import psycopg2
from datetime import datetime

start_time = datetime.now()

# connect to the database
conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
print("Database opened successfully")

# create a cursor
cursor = conn.cursor()

# path to the folder with the source data
path_to_data = "C:/Users/Pavel/PycharmProjects/database/"
# read the csv file into a DataFrame
sale_records = pd.read_csv(os.path.join(path_to_data, "ohortAnalysis_2016_2018.csv"),
                           sep=";", parse_dates=["date"], dayfirst=True)

query = "INSERT INTO sales (date, promo, site, user_id, transaction_id, amount) values (%s, %s, %s, %s, %s, %s)"
dataset_for_db = sale_records.values.tolist()

cursor.executemany(query, dataset_for_db)
conn.commit()

print("Operation done successfully")

# close the cursor and the connection
cursor.close()
conn.close()

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
      
      



That turned out even slower, roughly 10 minutes. The last option is to drop pandas as well and stream the file into PostgreSQL with COPY via cursor.copy_from.





import psycopg2
from datetime import datetime

start_time = datetime.now()

# connect to the database
conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
print("Database opened successfully")

# create a cursor
cursor = conn.cursor()


# open the csv file, skip the header row and copy the rest straight into the sales table
with open('ohortAnalysis_2016_2018.csv', 'r', encoding='UTF8') as f:
    next(f)
    cursor.copy_from(f, 'sales', sep=';', columns=('date','promo','site','user_id','transaction_id','amount'))
    conn.commit()


print("Operation done successfully")

# close the cursor and the connection
cursor.close()
conn.close()

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
      
      



Just 7 seconds. copy_from is the clear winner for a straight bulk load. The trade-off is that the data goes in exactly as it sits in the file: there is no pandas step in between where it could be cleaned or transformed.
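
If some preprocessing in pandas is still needed, the two approaches can be combined: clean the DataFrame first, dump it into an in-memory buffer and feed that buffer to copy_from. The sketch below is not from the original article; the drop_duplicates() step is just a placeholder for whatever cleaning you actually need, and the credentials are the ones used in the scripts above.

import io
import pandas as pd
import psycopg2

# read and preprocess the data in pandas (the cleaning step is illustrative)
sale_records = pd.read_csv("ohortAnalysis_2016_2018.csv",
                           sep=";", parse_dates=["date"], dayfirst=True)
sale_records = sale_records.drop_duplicates()

# dump the DataFrame into an in-memory buffer, without header and index
buffer = io.StringIO()
sale_records.to_csv(buffer, sep=";", header=False, index=False, date_format="%Y-%m-%d")
buffer.seek(0)

conn = psycopg2.connect("dbname='db' user='postgres' password='gfhjkm' host='localhost' port='5432'")
with conn, conn.cursor() as cursor:
    # copy_from accepts any file-like object, so the buffer works just like the file did
    cursor.copy_from(buffer, 'sales', sep=';',
                     columns=('date', 'promo', 'site', 'user_id', 'transaction_id', 'amount'))
conn.close()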





Now the SQL part: preparing the dataset for the cohort report. In the previous article these calculations were done in Python and Power BI; this time they move into the database. Below are two equivalent queries: the first finds each user's first purchase date with a subquery and a join, the second does the same with window functions.





SELECT s3.date,
	s3.user_id,
	s3.date - s2.first_date AS delta_days,
	ceil((s3.date - s2.first_date)::real/30::real)*30 AS cohort_days,
	to_char(s2.first_date,'YYYY-MM') AS first_transaction,
	s3.amount
FROM public.sales AS s3
LEFT JOIN
				(SELECT s1.user_id,
						MIN(s1.date) AS first_date
					FROM public.sales AS s1
					GROUP BY s1.user_id) AS s2 ON s3.user_id = s2.user_id
ORDER BY s3.user_id,
	s3.date


SELECT  s.date,
		s.user_id,
		s.date - FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date) AS delta_days,
		ceil((s.date - FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date))::real/30::real)*30 AS cohort_days,
		to_char(FIRST_VALUE(s.date) OVER(PARTITION BY s.user_id ORDER BY s.date),'YYYY-MM') AS first_transaction,
		s.amount
FROM public.sales AS s
ORDER BY s.user_id,
	s.date
      
      



For every transaction the query adds three derived columns: delta_days, the number of days since the user's first purchase; cohort_days, the 30-day bucket the purchase falls into; and first_transaction, the month of the first purchase, formatted with PostgreSQL's to_char().
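
For comparison, here is a rough pandas equivalent of those three columns. It is not code from the article, just a sketch that reads the same csv file and reproduces delta_days, cohort_days and first_transaction on the Python side.

import numpy as np
import pandas as pd

# the same source file that was loaded into the sales table
sale_records = pd.read_csv("ohortAnalysis_2016_2018.csv",
                           sep=";", parse_dates=["date"], dayfirst=True)

# first purchase date per user, broadcast back to every transaction
first_date = sale_records.groupby("user_id")["date"].transform("min")

sale_records["delta_days"] = (sale_records["date"] - first_date).dt.days
# 30-day cohort bucket: round the delta up to the nearest multiple of 30 days
sale_records["cohort_days"] = (np.ceil(sale_records["delta_days"] / 30) * 30).astype(int)
# month of the first purchase, the same as to_char(first_date, 'YYYY-MM')
sale_records["first_transaction"] = first_date.dt.strftime("%Y-%m")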





A head-on way to assign each purchase to a cohort period would be a long CASE expression with one branch per 30-day interval. With three years of data that means dozens of near-identical conditions: verbose, error-prone and painful to maintain. PostgreSQL lets us compute the bucket arithmetically in a single expression instead.





The logic of that expression is simple. The cohort step is 30 days, so the number of days since the first purchase is divided by 30 and rounded up with PostgreSQL's ceil(). A delta of 0 (the first purchase itself) gives 0, anything from 1 to 30 days gives 1, from 31 to 60 days gives 2, and so on. Multiplying the result back by 30 turns the bucket index into a readable label: 30, 60, 90 days since the first purchase.
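
A quick sanity check of that formula in plain Python (not from the article):

import math

# replicate ceil((delta_days)::real / 30::real) * 30 for a few sample deltas
for delta_days in (0, 1, 29, 30, 31, 60, 61):
    cohort_days = math.ceil(delta_days / 30) * 30
    print(f"delta_days={delta_days:>2} -> cohort_days={cohort_days}")

# delta_days= 0 -> cohort_days=0
# delta_days= 1 -> cohort_days=30
# delta_days=30 -> cohort_days=30
# delta_days=31 -> cohort_days=60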





One caveat: in PostgreSQL dividing an INTEGER by an INTEGER produces an INTEGER, so the fractional part is silently discarded and ceil() has nothing left to round up. That is why both operands are cast to ::real before the division.





A note on reuse: the result of this query is saved in the database as report_cohort_analysis (a view works fine), and the following steps reference it instead of repeating the whole query.





At this point we have a flat, transaction-level dataset with the cohort attributes attached to every row.





Before handing it over to the BI tool, let's verify the numbers by computing the cumulative revenue growth of each cohort directly in SQL.





SELECT r2.first_transaction,
		r2.cohort_days,
		--r2.total_amount,
		--sum(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days) as cumsum_amount,
		--first_value(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days) as first_total_amount,
		round((sum(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days)/ 
		first_value(r2.total_amount) OVER (PARTITION BY r2.first_transaction ORDER BY r2.first_transaction, r2.cohort_days)-1),3) as percent_cumsum_amount
FROM 
		(SELECT r.first_transaction, r.cohort_days, sum(r.amount) AS total_amount		
		FROM public.report_cohort_analysis AS r
		GROUP BY r.first_transaction, r.cohort_days
		ORDER BY r.first_transaction, r.cohort_days) as r2
      
      



The inner query sums the revenue of each cohort by period (total_amount). The outer query builds a running total of that revenue within each cohort using a window function, divides it by the revenue of the cohort's first period and subtracts one, which gives the cumulative growth as a share of the starting revenue; round() trims it to three decimal places.
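
The same check can be reproduced in pandas. The sketch below is not from the article; it pulls the prepared dataset from report_cohort_analysis with the connection settings used earlier and repeats the calculation.

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://postgres:gfhjkm@localhost:5432/db")

# transaction-level dataset built by the cohort query above
report = pd.read_sql("SELECT first_transaction, cohort_days, amount FROM public.report_cohort_analysis", engine)

# revenue of each cohort by period
total = (report.groupby(["first_transaction", "cohort_days"])["amount"]
               .sum()
               .reset_index(name="total_amount")
               .sort_values(["first_transaction", "cohort_days"]))

# cumulative revenue within each cohort, relative to its first period
total["cumsum_amount"] = total.groupby("first_transaction")["total_amount"].cumsum()
first_amount = total.groupby("first_transaction")["total_amount"].transform("first")
total["percent_cumsum_amount"] = (total["cumsum_amount"] / first_amount - 1).round(3)

print(total.head(10))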





These figures are exactly what the final report should reproduce.





The last step is visualization. The pivot could be done in SQL as well (PostgreSQL offers CROSSTAB in the tablefunc extension), but pivoting is precisely what BI tools are good at. Power BI connects to PostgreSQL directly (or, if you prefer, through a Python script), pulls in the prepared dataset, and all that is left is the matrix itself, since the heavy lifting has already been done in SQL.
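
If you go the Python-script route in Power BI, the data source boils down to a few lines of pandas; Power BI picks up every DataFrame defined in the script as a table. The connection string below simply reuses the credentials from the loading scripts, replace them with your own.

import pandas as pd
from sqlalchemy import create_engine

# connection settings reused from the loading scripts above
engine = create_engine("postgresql://postgres:gfhjkm@localhost:5432/db")

# Power BI exposes this DataFrame as a table named "dataset"
dataset = pd.read_sql("SELECT * FROM public.report_cohort_analysis", engine)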





In the report itself only the visual remains; all the calculations now live in the database.





That's all. Thanks for reading, and good luck with your own analyses!







