
Dealing with duplicate data in SQL
Python Workbook • May 22, 2025
python,
vscode,
SQL
Bike Store SQL Notebook
In this first analysis I will quickly walk through finding duplicates in SQL and how to handle them.
First I will download the data from Kaggle before importing it into my notebook.
import kagglehub
# Download latest version
path = kagglehub.dataset_download("dillonmyrick/bike-store-sample-database")
print("Path to dataset files:", path)
Downloading from https://www.kaggle.com/api/v1/datasets/download/dillonmyrick/bike-store-sample-database?dataset_version_number=3...
100%|██████████| 92.2k/92.2k [00:00<00:00, 2.18MB/s]
Extracting files...
Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\dillonmyrick\bike-store-sample-database\versions\3
# The files have been downloaded to that path as individual CSV tables, so I will create a database using sqlite3 and populate it with them.
import pandas as pd
import sqlite3
import os
# Folder where your CSVs are stored
csv_folder = r"C:\Users\User\OneDrive\Documents\Data Engineering\data_bikes"
# Create a connection to an in-memory SQLite database
conn = sqlite3.connect(":memory:")
# Loop through all CSV files and load them as tables
for file in os.listdir(csv_folder):
    if file.endswith(".csv"):
        table_name = os.path.splitext(file)[0]  # Use the filename (without .csv) as the table name
        file_path = os.path.join(csv_folder, file)
        df = pd.read_csv(file_path)
        df.to_sql(table_name, conn, index=False, if_exists="replace")
        print(f"Loaded table: {table_name}")
# Example: List all tables
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print("Tables in database:")
print(tables)
# Example: Query one of the tables
query_result = pd.read_sql_query("SELECT * FROM customers LIMIT 5;", conn)
print(query_result)
# Close the connection when done
# conn.close()
Loaded table: brands
Loaded table: categories
Loaded table: customers
Loaded table: orders
Loaded table: order_items
Loaded table: products
Loaded table: staffs
Loaded table: stocks
Loaded table: stores
Tables in database:
          name
0       brands
1   categories
2    customers
3       orders
4  order_items
5     products
6       staffs
7       stocks
8       stores
   customer_id  first_name last_name           phone                    email                street           city state zip_code
0            1       Debra     Burks            None    debra.burks@yahoo.com      9273 Thorne Ave.   Orchard Park    NY    14127
1            2       Kasha      Todd            None     kasha.todd@yahoo.com       910 Vine Street       Campbell    CA    95008
2            3      Tameka    Fisher            None    tameka.fisher@aol.com  769C Honey Creek St.  Redondo Beach    CA    90278
3            4       Daryl    Spence            None     daryl.spence@aol.com        988 Pearl Lane      Uniondale    NY    11553
4            5  Charolette      Rice  (916) 381-6003  charolette.rice@msn.com         107 River Dr.     Sacramento    CA    95820
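The next section mentions that some customer rows were duplicated for demonstration purposes. One way that could be done is to re-insert a table's own rows back into it; the toy table below is a minimal sketch, not the notebook's actual duplication step:

```python
import sqlite3

# Toy stand-in for the customers table (names are illustrative)
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE customers (customer_id INTEGER, first_name TEXT, last_name TEXT)")
conn.executemany(
    "INSERT INTO customers VALUES (?, ?, ?)",
    [(1, "Debra", "Burks"), (2, "Kasha", "Todd")],
)
# Re-insert the same rows to create exact duplicate copies
conn.execute("INSERT INTO customers SELECT * FROM customers")
conn.commit()
print(conn.execute("SELECT COUNT(*) FROM customers").fetchone()[0])  # 4 rows: each original now appears twice
```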
# Since I am using sqlite3, I can execute SQL queries directly on the database.
# Find duplicate customers based on customer_id, first_name, last_name, email, and phone.
# You can look for duplicates on particular columns; for this data I duplicated some customer rows for demonstration purposes.
# Define the SQL query to find duplicates
query = """
SELECT *, COUNT(*) as count
FROM customers
GROUP BY customer_id, first_name, last_name, email, phone
HAVING COUNT(*) > 1;
"""
# Execute the query and fetch the results
duplicates = pd.read_sql_query(query, conn)
print(f"Duplicate customers: {duplicates}")
Duplicate customers:
    customer_id  first_name  last_name           phone                         email                   street           city state zip_code  count
0            37       Melia      Brady            None         melia.brady@gmail.com          907 Shirley Rd.        Maspeth    NY    11378      2
1            38       Zelma   Browning            None        zelma.browning@aol.com        296 Second Street        Astoria    NY    11102      2
2            39     Janetta    Aguirre  (717) 670-2634       janetta.aguirre@aol.com         214 Second Court      Lancaster    NY    14086      2
3            40       Ronna     Butler            None        ronna.butler@gmail.com      9438 Plymouth Court         Encino    CA    91316      2
4            41      Kathie    Freeman            None        kathie.freeman@msn.com           667 Temple Dr.     Queensbury    NY    12804      2
5            42     Tangela      Quinn            None         tangela.quinn@aol.com   4 S. Purple Finch Road  Richmond Hill    NY    11418      2
6            43     Mozelle     Carter  (281) 489-9656        mozelle.carter@aol.com        895 Chestnut Ave.        Houston    TX    77016      2
7            44       Onita      Johns            None           onita.johns@msn.com       32 Glen Creek Lane         Elmont    NY    11003      2
8            45     Bennett  Armstrong            None     bennett.armstrong@aol.com        688 Walnut Street       Bethpage    NY    11714      2
9            46      Monika       Berg            None         monika.berg@gmail.com           369 Vernon Dr.         Encino    CA    91316      2
10           47   Bridgette     Guerra            None  bridgette.guerra@hotmail.com         9982 Manor Drive    San Lorenzo    CA    94580      2
11           48       Cesar    Jackson            None       cesar.jackson@gmail.com     8068 N. Griffin Ave.      Liverpool    NY    13090      2
12           49      Caroll       Hays            None         caroll.hays@yahoo.com        9381 Wrangler St.       Fairport    NY    14450      2
13           50   Cleotilde      Booth            None     cleotilde.booth@gmail.com            17 Corona St.     Sugar Land    TX    77478      2
14           51     Gertrud     Rhodes            None        gertrud.rhodes@aol.com  9961 Meadowbrook Street        Merrick    NY    11566      2
15           52          Tu    Ramirez            None        tu.ramirez@hotmail.com   24 W. Courtland Street  East Elmhurst    NY    11369      2
16           53   Saturnina     Garner            None    saturnina.garner@gmail.com      8538 Fairground St.       Glendora    CA    91740      2
17           54        Fran       Yang            None         fran.yang@hotmail.com            440 Pearl St.          Utica    NY    13501      2
18           55       Diana     Guerra            None      diana.guerra@hotmail.com           45 Chapel Ave.        Merrick    NY    11566      2
19           56      Lolita     Mosley  (281) 363-3309     lolita.mosley@hotmail.com    376 S. High Ridge St.        Houston    TX    77016      2
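Another way to spot duplicates is with the ROW_NUMBER() window function, which numbers the copies within each group so anything beyond the first copy stands out. This is a sketch on a toy table, and it assumes a SQLite build recent enough (3.25+) for window functions:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
CREATE TABLE customers (customer_id INTEGER, first_name TEXT, email TEXT);
INSERT INTO customers VALUES
  (1, 'Debra', 'debra@example.com'),
  (1, 'Debra', 'debra@example.com'),
  (2, 'Kasha', 'kasha@example.com');
""")
# rn = 1 marks the copy to keep; rn > 1 marks the extra copies
dupes = conn.execute("""
SELECT customer_id, first_name, rn FROM (
    SELECT *, ROW_NUMBER() OVER (
        PARTITION BY customer_id, first_name, email
        ORDER BY rowid
    ) AS rn
    FROM customers
)
WHERE rn > 1;
""").fetchall()
print(dupes)  # [(1, 'Debra', 2)] -- only the second copy of Debra's row
```

Unlike GROUP BY ... HAVING, this shows each extra copy as its own row, which is convenient when you want to delete exactly those rows.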
# Handling duplicates
# Remove duplicates across the same columns, keeping the first occurrence (the lowest rowid)
remove_duplicates_query = """
DELETE FROM customers
WHERE rowid NOT IN (
SELECT MIN(rowid)
FROM customers
GROUP BY customer_id, first_name, last_name, email, phone
);
"""
# Execute the query to remove duplicates
conn.execute(remove_duplicates_query)
# Commit the changes
conn.commit()
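For comparison, if the data were still in a pandas DataFrame rather than a SQL table, the same clean-up could be done with drop_duplicates; this is a sketch on toy data, not part of the notebook's pipeline:

```python
import pandas as pd

df = pd.DataFrame({
    "customer_id": [1, 1, 2],
    "first_name": ["Debra", "Debra", "Kasha"],
    "email": ["debra@example.com", "debra@example.com", "kasha@example.com"],
})
# keep="first" mirrors keeping MIN(rowid) in the SQL version
deduped = df.drop_duplicates(subset=["customer_id", "first_name", "email"], keep="first")
print(list(deduped["customer_id"]))  # [1, 2]
```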
# Verify that duplicates have been removed
verify_query = """
SELECT *, COUNT(*) as count
FROM customers
GROUP BY customer_id, first_name, last_name, email, phone
HAVING COUNT(*) > 1;
"""
# Execute the query and fetch the results
duplicates_after_removal = pd.read_sql_query(verify_query, conn)
print(f"Duplicate customers after removal: {duplicates_after_removal}")
Duplicate customers after removal: Empty DataFrame
Columns: [customer_id, first_name, last_name, phone, email, street, city, state, zip_code, count]
Index: []
# Exercise: now see if you can find duplicates in the orders table
# Find duplicate orders based on order_id, customer_id, order_date, and total_amount
# Define the SQL query to find the duplicates yourself
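For the orders exercise, the query follows the same GROUP BY ... HAVING pattern as the customers one. The sketch below uses a toy orders table; the column names are assumptions taken from the exercise comment, and the actual Bike Store orders table may differ (for example, it may not have a total_amount column):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
CREATE TABLE orders (order_id INTEGER, customer_id INTEGER, order_date TEXT);
INSERT INTO orders VALUES
  (1, 10, '2016-01-01'),
  (1, 10, '2016-01-01'),
  (2, 11, '2016-01-02');
""")
# Group on the columns that define "the same order"; any group larger than 1 is a duplicate
dupes = conn.execute("""
SELECT order_id, customer_id, order_date, COUNT(*) AS count
FROM orders
GROUP BY order_id, customer_id, order_date
HAVING COUNT(*) > 1;
""").fetchall()
print(dupes)  # [(1, 10, '2016-01-01', 2)]
```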