Automating ELT with GitHub Actions
Laura G. Funderburk
Workshop structure
Introductions
Section 1: Introduction to ELT (30 minutes)
Section 2: Data extraction, wrangling and loading with SQL and DuckDB (30 minutes)
Short Break (15 minutes)
Section 3: Introduction to GitHub Actions (30 minutes)
Section 4: CI/CD of ETL Processes with GitHub Actions (15 minutes)
Section 5: Deploying your ETL/ELT pipeline to Amazon Redshift (15 minutes)
Conclusion and Q&A
Tools we'll be working with throughout this workshop
- Database flavours: DuckDB and Amazon Redshift
- EDA tools: Jupyter notebooks and JupySQL
- Pipeline design and orchestration: YAML files and Ploomber
- Continuous Integration and Continuous Deployment (CI/CD): we will introduce how to set up GitHub Actions and how to structure them.
You can modify the steps above to work with your preferred DB flavours and pipeline orchestration methods
The pipeline we will develop today
About me
I work as a developer advocate for Ploomber. I have over three years of full-time experience as a data scientist in a variety of settings, including the academic, private, and NGO sectors. I completed my B.Sc. in Mathematics at SFU. In 2019, my alma mater awarded me a Terry Fox gold medal.
In my spare time, I train Brazilian Jiu-jitsu and I enjoy competing in regional tournaments.
Section 1: Introduction to ELT
What is ELT?
ELT stands for Extraction, Loading and Transformation.
- You extract raw data from various sources
- You load it in its natural state into a data warehouse or data lake
- You transform it as needed while in the target system
With ELT, all data cleansing, transformation, and enrichment occur within the data warehouse. You can interact with and transform the raw data as many times as needed.
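To make the pattern concrete, here is a minimal sketch assuming a local DuckDB file as the warehouse and a hypothetical clients.csv with a client_id column as the raw source:

import duckdb

# Connect to (or create) the local DuckDB file acting as our warehouse
con = duckdb.connect("warehouse.duck.db")

# Load: copy the raw file into the warehouse as-is, no cleaning yet
con.execute("CREATE OR REPLACE TABLE raw_clients AS SELECT * FROM read_csv_auto('clients.csv')")

# Transform: reshape the raw table inside the warehouse, as many times as needed
con.execute("""
    CREATE OR REPLACE TABLE clients_clean AS
    SELECT * FROM raw_clients
    WHERE client_id IS NOT NULL
""")
con.close()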
Why is ETL Important?
ETL processes are crucial for data analytics and business intelligence. They ensure that data is accurate, consistent, and available in a timely manner for analysis.
ELT vs ETL
Extract, transform, and load (ETL) and extract, load, and transform (ELT) are two data-processing approaches for analytics.
The ETL approach uses a set of business rules to process data from several sources before centralized integration.
The ELT approach loads data as it is and transforms it at a later stage, depending on the use case and analytics requirements.
Overview of JupySQL
JupySQL is a Jupyter extension designed to make SQL-based ETL processes seamless within Jupyter notebooks. With JupySQL, you can:
- Execute SQL queries directly from your Jupyter notebook.
- Integrate SQL results with Python code and libraries like Pandas.
- Visualize SQL query results using Jupyter's rich display capabilities.
Benefits of JupySQL
- Simplicity: No need to switch between SQL tools and Jupyter. Everything can be done in one place.
- Flexibility: Combine the power of SQL with Python's data manipulation capabilities.
- Compatibility: JupySQL is compatible with all major databases (e.g., PostgreSQL, MySQL, SQL Server), data warehouses (e.g., Snowflake, BigQuery, Redshift), and embedded engines (SQLite, and DuckDB).
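For example, a minimal notebook session with JupySQL and an in-memory DuckDB connection (a sketch of the workflow we will use later) looks like this:

# Load the JupySQL extension and connect to an in-memory DuckDB database
%load_ext sql
%sql duckdb://

# Run a query; with autopandas enabled, the result arrives as a pandas DataFrame
result = %sql SELECT 42 AS answer
print(type(result))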
Overview of DuckDB
DuckDB is a free, open-source, embedded, in-process, relational, OnLine Analytical Processing (OLAP) DataBase Management System (DBMS).
Embedded and in-process mean the DBMS runs inside the application that uses it, rather than as an external process your application connects to.
OLAP means the database is designed for data analysis.
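Because it is embedded, there is no server to install or connect to: you import the module and query directly from the Python process, even against in-memory objects such as pandas DataFrames. A minimal sketch:

import duckdb
import pandas as pd

df = pd.DataFrame({"region": ["Prague", "south Bohemia"], "clients": [600, 300]})

# DuckDB runs inside this Python process and can query the DataFrame by name
print(duckdb.sql("SELECT region, clients FROM df ORDER BY clients DESC").fetchall())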
Overview of Amazon Redshift
Amazon Redshift is a cloud-based, petabyte-scale data warehouse service offered as part of Amazon's ecosystem of data solutions.
It is a relational database management system (RDBMS) and a type of OLAP database system.
OLAP vs OLTP
$\Rightarrow$ How is data stored?
OLTP databases commonly store data by records, ensuring all data associated with a record is stored close together in memory. They are optimised for reading and writing rows efficiently.
OLAP databases are commonly oriented by columns: all data associated with a column is stored near one another in memory, and the database is optimised for reading and computing on columns efficiently.
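As an illustrative comparison on a throwaway DuckDB table, the first query below is the record-lookup shape OLTP systems are optimised for, and the second is the column-scan shape OLAP systems are optimised for:

import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE transactions AS SELECT range AS id, range % 7 AS amount FROM range(1000)")

# OLTP-style access: fetch a single full record by key
print(con.execute("SELECT * FROM transactions WHERE id = 42").fetchall())

# OLAP-style access: scan one column across all rows and aggregate it
print(con.execute("SELECT SUM(amount) FROM transactions").fetchone())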
Installing and Loading JupySQL and DuckDB in Jupyter Notebooks
To get started with JupySQL and DuckDB, you first need to install the modules.
Ensure you have followed the steps in the README to install the required modules.
- Fork repository
- Clone your fork of the repository:
git clone https://github.com/<your-github-id>/automate-etl-github-actions.git
cd automate-etl-github-actions
- Create a virtual environment and install dependencies:
conda create -n automate-etl python=3.10
conda activate automate-etl
pip install poetry
poetry install
Hands-on exercise
- Fork repository
- Clone your fork of the repository:
git clone https://github.com/<your-github-id>/automate-etl-github-actions.git
cd automate-etl-github-actions
- Create a virtual environment and install dependencies:
conda create -n automate-etl python=3.10
conda activate automate-etl
pip install poetry
poetry install
Configuring JupySQL's behaviour
By default, Poetry creates a pyproject.toml file. We can configure JupySQL's behaviour there, for example to return all SQL query results as pandas DataFrames by default.
We will use that configuration:
[tool.jupysql.SqlMagic]
displaycon = false
feedback = true
autopandas = true
named_parameters = true
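If you prefer not to rely on pyproject.toml, the same options can also be set at runtime from a notebook cell (equivalent settings; option names may vary slightly between JupySQL versions):

# Configure JupySQL's SqlMagic directly in the notebook
%load_ext sql
%config SqlMagic.displaycon = False
%config SqlMagic.feedback = True
%config SqlMagic.autopandas = True
%config SqlMagic.named_parameters = True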
Data we will work with
Source: UCI Machine Learning Repository
URL: https://archive-beta.ics.uci.edu/dataset/222/bank+marketing
Topic: The data is related to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).
Data Citation
Moro, S., Rita, P., and Cortez, P. (2012). Bank Marketing. UCI Machine Learning Repository. https://doi.org/10.24432/C5K306.
Section 2: Data extraction, loading and wrangling working example
Extracting data
We can set up a simple script to extract the data with Python. The data can be downloaded via a URL, which returns a .zip archive.
The key steps in extraction are:
- Download the data via urllib.request
- Unzip the contents of the downloaded archive
- Write the contents of the archive into files
Data download and extraction
Sample functionality:
import urllib.request
import zipfile

# Download the ZIP file to a temporary location
zip_file_path, _ = urllib.request.urlretrieve(<data-download-url>)

# Extract the contents of the ZIP file into the destination folder
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall(<destination-folder>)
To parse each of the files, we will need to craft a bit more code.
Recommendation: craft an extractdata.py script with your functions to extract and parse the data. Let's take a look at a sample script:
https://github.com/lfunderburk/automate-elt-github/blob/main/pipeline/etl/extractdata.py
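For reference, here is a minimal sketch of what such a class could look like; the names mirror the repository and the tests shown later in this workshop, but the actual implementation may differ:

import urllib.request
import zipfile


class MarketData:
    """Download a ZIP archive from `url` and unpack it into `output_folder`."""

    def __init__(self, url, output_folder):
        self.url = url
        self.output_folder = output_folder

    def extract(self):
        # Download the ZIP archive to a temporary file
        zip_file_path, _ = urllib.request.urlretrieve(self.url)
        # Unpack every file into the output folder
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall(self.output_folder)
            return zip_ref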
Initializing the pipeline orchestration process
One practice I like to follow is to package my pipelines as soon as I start developing them. This increases my ability to upgrade their complexity, while still being able to evaluate and test that the smaller components work as expected.
For this workshop, I will package my pipelines with Ploomber, which allows me to combine Python scripts, SQL scripts and even Jupyter notebooks as part of the pipeline.
At the core of Ploomber pipelines are YAML files. Here is a sample pipeline.yaml file:
tasks:
  - source: etl/extractdata.py
    product:
      nb: products/extract-pipeline.ipynb
The Ploomber pipeline consists of tasks, whose sub-entries contain the source (replace with your script or Jupyter notebook) and the product. Ploomber uses papermill, which lets you parameterize, execute, and analyze notebooks, so you can rely on an open-source framework to experiment with Jupyter notebooks. This, combined with the ability to include .py and .sql scripts, gives you flexibility in how you craft your pipelines.
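Ploomber drives papermill for you, but papermill can also be called directly, which is useful when experimenting with a single notebook. A sketch with illustrative notebook paths and a hypothetical db_path parameter:

import papermill as pm

# Execute a notebook and save a copy with its outputs; values in `parameters`
# are injected into the notebook's "parameters" cell
pm.execute_notebook(
    "etl/wrangle-data.ipynb",
    "products/wrangle-data-output.ipynb",
    parameters={"db_path": "etl/bank_data.duck.db"},
)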
Directory structure - before running the pipeline
pipeline/
├── pipeline.yaml
├── pyproject.toml
├── README.md
├── etl/
│   └── extractdata.py
└── products/
Hands-on exercise
- Create a pipeline.yaml file under automate-elt-github/pipeline/
- Add the tasks:
tasks:
  - source: etl/extractdata.py
    product:
      nb: products/extract-pipeline.ipynb
Executing the pipeline
$ cd pipeline/
$ poetry run ploomber build
This yields:
Loading pipeline...
Executing: 100%|█████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00, 1.87s/cell]
Building task 'extractdata': 100%|█████████████████████████████████████████████████| 1/1 [00:09<00:00, 9.34s/it]
name         Ran?    Elapsed (s)    Percentage
-----------  ------  -------------  ------------
extractdata  True    9.33752        100
Directory structure - after running the pipeline
pipeline/
├── pipeline.yaml
├── pyproject.toml
├── README.md
├── etl/
│   ├── extractdata.py
│   └── expanded_data/
│       ├── file_1.csv
│       ├── file_2.csv
│       └── ...
└── products/
    ├── extract-pipeline.ipynb
    └── extract-pipeline.ipynb.metadata
Goal: upgrade this pipeline to load the data into a database, and further transform it using SQL.
Loading the extracted data, then transforming on a DuckDB instance
We can now populate a database. We will use JupySQL to load the raw data into DuckDB, and then transform it in place.
Here is a sample Jupyter notebook that joins the tables:
https://github.com/lfunderburk/automate-elt-github/blob/main/pipeline/etl/wrangle-data.ipynb
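In essence, the notebook loads the extracted CSV files into DuckDB and joins them into analysis-ready tables. A condensed sketch of the idea (file, table, and column names are illustrative):

import duckdb

con = duckdb.connect("etl/bank_data.duck.db")

# Load two of the extracted CSV files into raw tables
con.execute("CREATE OR REPLACE TABLE account AS SELECT * FROM read_csv_auto('etl/expanded_data/account.csv')")
con.execute("CREATE OR REPLACE TABLE trans AS SELECT * FROM read_csv_auto('etl/expanded_data/trans.csv')")

# Transform: join them into a single analysis-ready table
con.execute("""
    CREATE OR REPLACE TABLE account_trans_order AS
    SELECT *
    FROM account
    JOIN trans USING (account_id)
""")
con.close()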
We will then expand the pipeline and verify our results.
tasks:
  - source: etl/extractdata.py
    product:
      nb: products/extract-pipeline.ipynb
  - source: etl/wrangle-data.ipynb
    product:
      nb: products/wrangle-data-pipeline.ipynb
      data: etl/bank_data.duck.db
We run the pipeline
$ poetry run ploomber build
Executing:  56%|████████████████████                 | 5/9 [00:03<00:02, 1.44cell/s]
Building task 'wrangle-data': 100%|███████████████████| 1/1 [00:03<00:00, 3.49s/it]
name          Ran?    Elapsed (s)    Percentage
------------  ------  -------------  ------------
wrangle-data  True    3.48937        100
extractdata   False   0              0
EDA within Jupyter with JupySQL
# Loading in SQL extension
%reload_ext sql
# Initiating a DuckDB database named 'bank_data.duck.db' to run SQL queries
%sql duckdb:///../bank_data.duck.db
| Config           | value |
|------------------|-------|
| displaycon       | False |
| feedback         | True  |
| autopandas       | True  |
| named_parameters | True  |
%sqlcmd tables
| Name                    |
|-------------------------|
| client_account_district |
| account_trans_order     |
Data profiling
"""
client_account_district
account_trans_order
"""
%sqlcmd profile -t account_trans_order
|        | account_id | frequency | account_creation_date | trans_id | transaction_date | transaction_type | operation | transaction_amount | balance | order_id | bank_to | account_to | order_amount |
|--------|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count  | 1715140 | 1715140 | 1715140 | 1715140 | 1715140 | 1715140 | 1448066 | 1715140 | 1715140 | 1715140 | 1715140 | 1715140 | 1715140 |
| unique | 3758 | 3 | 1468 | 913010 | 2191 | 3 | 5 | 36147 | 494619 | 6471 | 13 | 6446 | 4412 |
| top    | nan | POPLATEK MESICNE | nan | nan | nan | VYDAJ | VYBER | nan | nan | nan | YZ | nan | nan |
| freq   | nan | 1566570 | nan | nan | nan | 1102627 | 634299 | nan | nan | nan | 139314 | nan | nan |
| mean   | 3094.1083 | nan | 944777.5185 | 1331562.3713 | 965826.2771 | nan | nan | 5635.1784 | 38775.0580 | 33974.2133 | nan | 49244253.5220 | 3164.0519 |
| std    | 2640.1109 | nan | 14117.7740 | 1214076.4248 | 13890.0655 | nan | nan | 9205.2246 | 21598.1211 | 3918.6348 | nan | 28917744.6902 | 2690.0998 |
| min    | 1 | nan | 930101 | 1 | 930101 | nan | nan | 0.0 | -35456.6 | 29401 | nan | 399 | 1.0 |
| 25%    | 1249.0000 | nan | 930728.0000 | 431994.0000 | 960126.0000 | nan | nan | 150.0000 | 22898.8000 | 31228.0000 | nan | 23890793.0000 | 1069.0000 |
| 50%    | 2493.0000 | nan | 940822.0000 | 859667.0000 | 970415.0000 | nan | nan | 1974.0000 | 33696.4000 | 33086.0000 | nan | 49547737.0000 | 2454.0000 |
| 75%    | 3751.0000 | nan | 960316.0000 | 2121552.0000 | 980228.0000 | nan | nan | 6388.0000 | 49760.4000 | 34928.0000 | nan | 73936581.0000 | 4545.0000 |
| max    | 11362 | nan | 971229 | 3682987 | 981231 | nan | nan | 87400.0 | 209637.0 | 46338 | nan | 99994199 | 14882.0 |
Which district has the highest number of entrepreneurs per 1000 inhabitants?
%%sql
SELECT DISTINCT district_name, no_of_entrepreneurs_per_1000_inhabitants
FROM client_account_district
ORDER BY no_of_entrepreneurs_per_1000_inhabitants DESC
LIMIT 1;
|   | district_name | no_of_entrepreneurs_per_1000_inhabitants |
|---|---------------|------------------------------------------|
| 0 | Hl.m. Praha   | 167                                      |
Which region has the highest average salary?
%%sql --save average_salary --no-execute
SELECT region, AVG(average_salary) AS avg_salary
FROM client_account_district
GROUP BY region
ORDER BY avg_salary DESC
import seaborn as sns
import matplotlib.pyplot as plt
avg_sal = %sql SELECT * FROM average_salary
sns.barplot(x=avg_sal['region'], y=avg_sal['avg_salary'])
plt.xticks(rotation=45)
plt.title("Average salary by region")
plt.show()
How many clients are there in each region?
%%sql --save client_frequency --no-execute
SELECT region, COUNT(DISTINCT client_id) AS num_clients
FROM client_account_district
GROUP BY region;
client_freq = %sql SELECT * FROM client_frequency
client_freq.sort_values(by='num_clients', inplace=True)
sns.barplot(x=client_freq['region'], y=client_freq['num_clients'])
plt.xticks(rotation=45)
plt.title("Number of clients in each region")
plt.show()
Is there a correlation between the average salary of a district and the average transaction amount of clients from that district?
%%sql --save avg_dist_avg_transaction --no-execute
SELECT
cad.region,
AVG(cad.average_salary) AS avg_district_salary,
AVG(ato.transaction_amount) AS avg_transaction_amount
FROM client_account_district cad
JOIN account_trans_order ato ON cad.account_id = ato.account_id
GROUP BY cad.region
ORDER BY avg_district_salary, avg_transaction_amount;
avg_d_avg_tr = %sql SELECT * FROM avg_dist_avg_transaction
avg_d_avg_tr.head()
|   | region        | avg_district_salary | avg_transaction_amount |
|---|---------------|---------------------|------------------------|
| 0 | east Bohemia  | 8625.323651         | 5684.243681            |
| 1 | south Bohemia | 8793.189384         | 5764.042859            |
| 2 | west Bohemia  | 8986.326405         | 5393.660454            |
| 3 | south Moravia | 9139.420950         | 5304.807736            |
| 4 | north Bohemia | 9250.226137         | 5527.467883            |
Visualize:
sns.scatterplot(data=avg_d_avg_tr,
x = "avg_district_salary",
y = "avg_transaction_amount",
hue='region')
plt.xlabel("average district salary")
plt.ylabel("average transaction amount")
plt.show()
Break
Section 3: Introduction to GitHub Actions
What is GitHub Actions?
GitHub Actions is a CI/CD (Continuous Integration/Continuous Deployment) tool integrated directly into the GitHub platform. It allows you to automate, customize, and execute your software development workflows right in your repository.
Think of it as a powerful tool in your toolkit that can handle tasks like building, testing, and deploying your code.
Why GitHub Actions?
- Integrated with GitHub: No need for third-party platforms or integrations. Everything you need is right within GitHub.
- Flexibility: Define workflows using simple YAML files. You can create tasks for almost any action you can think of, from sending emails to deploying applications.
- Matrix Builds: Test your code on multiple versions of a language or various OS simultaneously.
- Rich Ecosystem: Leverage the vast marketplace of pre-built actions created by the community, or create your own.
- Cost-Effective: Generous free tier for public repositories, and competitive pricing for private ones.
Core Concepts
- Workflow: A workflow is an automated procedure, defined by you. It can be triggered by various events like a push, pull request, or even on a schedule.
- Job: A workflow can have multiple jobs. Jobs run in parallel by default, but can be configured to depend on one another.
- Step: Each job has a series of steps. A step can be a set of commands to run, or an action from the marketplace.
- Action: Reusable pieces of code. You can create your own or use actions created by others in the GitHub community.
Sample workflows
Workflow triggered by a push or pull request:
This workflow will be activated every time there's a push or a pull request to the main branch. It's useful for continuous integration, ensuring that any new code targeting the main branch passes all tests.
.github/workflows/on-push.yml
name: On Push/Pull Workflow to the main branch

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
Workflow triggered on a schedule:
This workflow runs at a scheduled time, for example, every day at 2 AM. It's useful for routine tasks like daily builds or regular data updates.
.github/workflows/scheduled.yml
name: Scheduled Workflow

on:
  schedule:
    - cron: '0 2 * * *' # This means 2 AM every day

jobs:
  routine-task:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Execute routine task
        run: |
          # Your commands for the routine task
Section 4: CI/CD of ETL Processes with GitHub Actions
One of the key uses of GitHub Actions is the automation of testing and, in particular, of the process known as Continuous Integration/Continuous Deployment (CI/CD).
Key steps:
- Set up testing for your pipeline
- Set up GitHub action: include environment setup, testing of application, mechanism of deployment
- Create a new branch, make commits, and create a PR to the main branch.
Sample testing with pytest
We can create a tests/ folder at the root level of our GitHub repository. Some sample tests:
test_download: this will mimic the first stage in our data pipeline
import pytest
from unittest.mock import patch, Mock

from pipeline.etl.extractdata import MarketData


@pytest.fixture
def market_data():
    url = "https://tinyurl.com/jb-bank-m"
    output_folder = "test_output"
    return MarketData(url, output_folder)
Sample testing with pytest
test_download: this will mimic the first stage in our data pipeline
def test_extract(market_data):
    with patch('urllib.request.urlretrieve', return_value=('path_to_zip', None)) as MockUrlRetrieve, \
         patch('zipfile.ZipFile') as MockZip:
        mock_zip = Mock()
        MockZip.return_value.__enter__.return_value = mock_zip

        result = market_data.extract()

        MockUrlRetrieve.assert_called_once_with(market_data.url)
        MockZip.assert_called_once_with('path_to_zip', 'r')
        mock_zip.extractall.assert_called_once_with(market_data.output_folder)
        assert result == mock_zip
Running the tests locally
From our local computer, at the root level of our repository, we can run:
$ poetry run pytest
====================================== test session starts ======================================
platform darwin -- Python 3.10.0, pytest-7.4.2, pluggy-1.3.0
rootdir: /Users/macpro/Documents/GitHub/automate-elt-github
plugins: anyio-4.0.0
collected 2 items

tests/test_download.py ..                                                                  [100%]

======================================= 2 passed in 0.05s =======================================
Adding testing to GitHub actions
...
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install poetry
          poetry lock
          poetry install --no-root
      - name: Set PYTHONPATH and Test with pytest
        run: |
          export PYTHONPATH=$PYTHONPATH:$GITHUB_WORKSPACE/automate-elt-github/
          poetry run pytest
Hands-on exercise
In your cloned fork of the repo, add a complete GitHub Actions workflow under .github/workflows called mytest.yaml that will run the tests when:
- Pushing to the main branch
- Creating a PR to the main branch
The workflow should:
- Have read permissions
- Have a job that installs Poetry and all modules in this repo
- Have a job that runs the tests
Section 5: Deploying your ETL/ELT pipeline to Amazon Redshift
Amazon Redshift uses SQL to analyze structured and semi-structured data across data warehouses, operational databases, and data lakes, using AWS-designed hardware and machine learning to deliver the best price performance at any scale.
Adding Redshift data population to our pipeline
Let's explore redshift-data-population.ipynb
We can populate a Redshift instance, provided we have:
REDSHIFT_USERNAME = "your-username"
REDSHIFT_PASSWORD = "your-password"
REDSHIFT_HOST = "default-workgroup.your-iam-role.your-zone.redshift-serverless.amazonaws.com"
IAM_role = "arn:aws:iam::your-iam-role:role/redshift-your-role-space"
We can define the above as environment variables, for example in a .env file.
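As a sketch, the notebook can read these variables and build a connection string for Redshift; the driver, port, and database name below are assumptions and should match whatever the repository actually uses:

import os
from dotenv import load_dotenv  # provided by the python-dotenv package

# Read the .env file into the process environment (a no-op on GitHub Actions,
# where the same names arrive as environment variables from secrets)
load_dotenv()

user = os.environ["REDSHIFT_USERNAME"]
password = os.environ["REDSHIFT_PASSWORD"]
host = os.environ["REDSHIFT_HOST"]

# Hypothetical connection string for SQLAlchemy with the redshift_connector driver
connection_string = f"redshift+redshift_connector://{user}:{password}@{host}:5439/dev"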
We can then store these variables as secrets on GitHub
name: Routine Data Fetch and Push to Redshift

on:
  schedule:
    - cron: '0 2 * * *' # This means 2 AM every day

jobs:
  fetch-and-push:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - name: Install Poetry using pip
        run: |
          python -m pip install poetry
      - name: Install dependencies using Poetry
        run: |
          poetry install
      - name: Run Ploomber pipeline to fetch and push data to Redshift
        env:
          REDSHIFT_USERNAME: ${{ secrets.REDSHIFT_USER }}
          REDSHIFT_PASSWORD: ${{ secrets.REDSHIFT_PASS }}
          REDSHIFT_HOST: ${{ secrets.REDSHIFT_HOST }}
          IAM_role: ${{ secrets.IAM_role }}
        run: |
          cd pipeline
          poetry run ploomber build
Final thoughts
In this workshop we learned:
- We can enable interactive data extraction and wrangling with SQL through JupySQL
- We can orchestrate Python scripts and Jupyter notebooks through Ploomber pipelines
- We can perform automated testing and CI/CD through GitHub Actions
- We can deploy our pipelines to cloud data warehouses, such as Amazon Redshift, through a combination of Ploomber and GitHub Actions