# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/visualizations/Viz_RecencyofPurchase.png
/kaggle/input/visualizations/Viz_CustomerYearlyComparison.png
/kaggle/input/visualizations/Viz_QuarterlySalesYearComparison.png
/kaggle/input/visualizations/Viz_ParetoAnalysisCustomers.png
/kaggle/input/visualizations/Viz_SalesVolumeVsUnitPrice.png
/kaggle/input/visualizations/Viz_topPerformersSalesVolume.png
/kaggle/input/visualizations/Viz_AverageSpend.png
/kaggle/input/visualizations/Viz_FrequencyOfPurchase.png
/kaggle/input/visualizations/Viz_QuarterlySales.png
/kaggle/input/visualizations/Viz_topPerformers.png
/kaggle/input/visualizations/Viz_SalesByCountry.png
/kaggle/input/visualizations/Viz_ParetoAnalysisItems.png
/kaggle/input/online-retails-sale-dataset/Online Retail.csv


df = pd.read_csv('/kaggle/input/online-retails-sale-dataset/Online Retail.csv')

# Check the first few rows of the dataset
df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   InvoiceDate  541909 non-null  object 
 2   InvoiceTime  541909 non-null  object 
 3   StockCode    541909 non-null  object 
 4   Description  540455 non-null  object 
 5   Quantity     541909 non-null  int64  
 6   UnitPrice    541909 non-null  float64
 7   Totalsale    541909 non-null  float64
 8   CustomerID   406829 non-null  float64
 9   Country      541909 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 41.3+ MB


# Convert data types
# Convert 'InvoiceDate' to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')
# Merge 'InvoiceDate' and 'InvoiceTime' to create 'InvoiceDatetime'
df['InvoiceDateTime'] = pd.to_datetime(df['InvoiceDate'].astype(str) + ' ' + df['InvoiceTime'], errors='coerce')
# Drop the separate columns 
df.drop(columns=['InvoiceDate', 'InvoiceTime'], inplace=True)

# Convert CustomerID to string
df['CustomerID'] = df['CustomerID'].astype('str')

# Check data types again
print(df.dtypes)

InvoiceNo                  object
StockCode                  object
Description                object
Quantity                    int64
UnitPrice                 float64
Totalsale                 float64
CustomerID                 object
Country                    object
InvoiceDateTime    datetime64[ns]
dtype: object


# Create a new column for the quarter
df['Quarter'] = df['InvoiceDateTime'].dt.quarter

# Check the first few rows to see the result
print(df[['InvoiceDateTime', 'Quarter']].head())

      InvoiceDateTime  Quarter
0 2010-01-12 08:26:00      1.0
1 2010-01-12 08:26:00      1.0
2 2010-01-12 08:26:00      1.0
3 2010-01-12 08:26:00      1.0
4 2010-01-12 08:26:00      1.0


# Get summary statistics for the numeric columns
summary_stats = df.describe()
print(summary_stats)

            Quantity      UnitPrice      Totalsale  \
count  541909.000000  541909.000000  541909.000000   
mean        9.552250       4.611114      17.987795   
min    -80995.000000  -11062.060000 -168469.600000   
25%         1.000000       1.250000       3.400000   
50%         3.000000       2.080000       9.750000   
75%        10.000000       4.130000      17.400000   
max     80995.000000   38970.000000  168469.600000   
std       218.081158      96.759853     378.810824   

                     InvoiceDateTime        Quarter  
count                         232959  232959.000000  
mean   2011-05-14 03:14:20.515884800       2.523890  
min              2010-01-12 08:26:00       1.000000  
25%              2011-03-04 11:36:00       2.000000  
50%              2011-06-09 10:46:00       3.000000  
75%              2011-09-06 10:46:00       3.000000  
max              2011-12-10 17:19:00       4.000000  
std                              NaN       1.057696


# Check for negative quantities and select relevant columns
negative_quantity = df[df['Quantity'] < 0][['InvoiceNo', 'CustomerID','InvoiceDateTime', 'Quantity', 'UnitPrice', 'Totalsale']]
print("Negative Quantities:\n", negative_quantity)

# Check for negative totalsales and select relevant columns
negative_totalsale = df[df['Totalsale'] < 0][['InvoiceNo', 'InvoiceDateTime', 'Quantity', 'UnitPrice', 'Totalsale']]
# print("Negative Totalsale:\n", negative_totalsale)

Negative Quantities:
        InvoiceNo CustomerID     InvoiceDateTime  Quantity  UnitPrice  \
141      C536379    14527.0 2010-01-12 09:41:00        -1      27.50   
154      C536383    15311.0 2010-01-12 09:49:00        -1       4.65   
235      C536391    17548.0 2010-01-12 10:24:00       -12       1.65   
236      C536391    17548.0 2010-01-12 10:24:00       -24       0.29   
237      C536391    17548.0 2010-01-12 10:24:00       -24       0.29   
...          ...        ...                 ...       ...        ...   
540449   C581490    14397.0 2011-09-12 09:57:00       -11       0.83   
541541   C581499    15498.0 2011-09-12 10:28:00        -1     224.69   
541715   C581568    15311.0 2011-09-12 11:57:00        -5      10.95   
541716   C581569    17315.0 2011-09-12 11:58:00        -1       1.25   
541717   C581569    17315.0 2011-09-12 11:58:00        -5       1.25   

        Totalsale  
141        -27.50  
154         -4.65  
235        -19.80  
236         -6.96  
237         -6.96  
...           ...  
540449      -9.13  
541541    -224.69  
541715     -54.75  
541716      -1.25  
541717      -6.25  

[10624 rows x 6 columns]


returns = df[df['Quantity'] < 0]

# Identify sales (positive quantity)
sales = df[df['Quantity'] > 0]

# Merge returns with sales on CustomerID and StockCode
merged = pd.merge(returns, sales, on=['CustomerID', 'StockCode'], how='left', suffixes=('_return', '_sale'))

# Check for cases where there is a negative quantity but no matching positive quantity
no_sales_for_returns = merged[merged['Quantity_sale'].isnull()]

# Display the returns without corresponding sales
print("Returns without corresponding sales:\n", no_sales_for_returns[['CustomerID', 'StockCode', 'Quantity_return']])

# Check if all negative quantities have corresponding positive quantities
if no_sales_for_returns.empty:
    print("All negative quantities have corresponding positive quantities.")
else:
    print(f"There are {no_sales_for_returns.shape[0]} returns without corresponding sales.")

Returns without corresponding sales:
       CustomerID StockCode  Quantity_return
0        14527.0         D               -1
5        17548.0     22556              -12
6        17548.0     21984              -24
7        17548.0     21983              -24
8        17548.0     21980              -24
...          ...       ...              ...
79056    17490.0         M               -1
79078        nan    84801A             -102
79419    16933.0     22627               -1
79431    14442.0     21533               -1
79584    15498.0         M               -1

[1261 rows x 3 columns]
There are 1261 returns without corresponding sales.


# Filter for records where CustomerID is NaN or empty
returns_with_missing_customerID = no_sales_for_returns[no_sales_for_returns['CustomerID'].isnull()]
nan_customerID = no_sales_for_returns[no_sales_for_returns['CustomerID'].isna()] 
# Display the results
print("Returns without corresponding sales and missing CustomerID:\n", returns_with_missing_customerID)
print(f"Number of records with missing CustomerID: {returns_with_missing_customerID.shape[0]}")

# Display the rows with missing CustomerID
print(nan_customerID)
print(f"Number of rows with missing CustomerID: {nan_customerID.shape[0]}")

Returns without corresponding sales and missing CustomerID:
 Empty DataFrame
Columns: [InvoiceNo_return, StockCode, Description_return, Quantity_return, UnitPrice_return, Totalsale_return, CustomerID, Country_return, InvoiceDateTime_return, Quarter_return, InvoiceNo_sale, Description_sale, Quantity_sale, UnitPrice_sale, Totalsale_sale, Country_sale, InvoiceDateTime_sale, Quarter_sale]
Index: []
Number of records with missing CustomerID: 0
Empty DataFrame
Columns: [InvoiceNo_return, StockCode, Description_return, Quantity_return, UnitPrice_return, Totalsale_return, CustomerID, Country_return, InvoiceDateTime_return, Quarter_return, InvoiceNo_sale, Description_sale, Quantity_sale, UnitPrice_sale, Totalsale_sale, Country_sale, InvoiceDateTime_sale, Quarter_sale]
Index: []
Number of rows with missing CustomerID: 0


# Check if 'nan' is present as a string in CustomerID
string_nan_customerID = no_sales_for_returns[no_sales_for_returns['CustomerID'] == 'nan']

# Display the rows where 'CustomerID' is 'nan' (as a string)
#print(string_nan_customerID)
print(f"Number of rows with 'nan' as a string: {string_nan_customerID.shape[0]}")

Number of rows with 'nan' as a string: 313


# Replace 'nan' strings in CustomerID with actual NaN
no_sales_for_returns.loc[no_sales_for_returns['CustomerID'] == 'nan', 'CustomerID'] = np.nan

# Verify if the conversion worked
nan_customerID = no_sales_for_returns[no_sales_for_returns['CustomerID'].isna()]
print(f"Number of rows with missing CustomerID after conversion: {nan_customerID.shape[0]}")

Number of rows with missing CustomerID after conversion: 313


df.to_csv('cleanedRetailsDataset.csv', index=False)


df.head()


topProductsVolume = df.groupby(['StockCode', 'Description'])['Quantity'].sum().sort_values(ascending=False)
print(topProductsVolume.head(10))

StockCode  Description                       
84077      WORLD WAR 2 GLIDERS ASSTD DESIGNS     53847
85099B     JUMBO BAG RED RETROSPOT               47363
84879      ASSORTED COLOUR BIRD ORNAMENT         36381
22197      POPCORN HOLDER                        36334
21212      PACK OF 72 RETROSPOT CAKE CASES       36039
85123A     WHITE HANGING HEART T-LIGHT HOLDER    35025
23084      RABBIT NIGHT LIGHT                    30680
22492      MINI PAINT SET VINTAGE                26437
22616      PACK OF 12 LONDON TISSUES             26315
21977      PACK OF 60 PINK PAISLEY CAKE CASES    24753
Name: Quantity, dtype: int64


topProductsRevenue = df.groupby(['StockCode', 'Description'])['Totalsale'].sum().sort_values(ascending=False)
print(topProductsRevenue.head(10))

StockCode  Description                       
DOT        DOTCOM POSTAGE                        206245.48
22423      REGENCY CAKESTAND 3 TIER              164762.19
47566      PARTY BUNTING                          98302.98
85123A     WHITE HANGING HEART T-LIGHT HOLDER     97715.99
85099B     JUMBO BAG RED RETROSPOT                92356.03
23084      RABBIT NIGHT LIGHT                     66756.59
POST       POSTAGE                                66230.64
22086      PAPER CHAIN KIT 50'S CHRISTMAS         63791.94
84879      ASSORTED COLOUR BIRD ORNAMENT          58959.73
79321      CHILLI LIGHTS                          53768.06
Name: Totalsale, dtype: float64


topRegions = df.groupby(['Country'])['Totalsale'].sum().sort_values(ascending = False)
print(topRegions.head(10))

Country
United Kingdom    8187806.36
Netherlands        284661.54
EIRE               263276.82
Germany            221698.21
France             197403.90
Australia          137077.27
Switzerland         56385.35
Spain               54774.58
Belgium             40910.96
Sweden              36595.91
Name: Totalsale, dtype: float64


quartersRanked = df.groupby(['Quarter'])['Totalsale'].sum().sort_values(ascending = False)
print(quartersRanked)

Quarter
3.0    1177377.86
2.0    1091831.55
4.0     966163.93
1.0     912905.23
Name: Totalsale, dtype: float64


def categorize_sales_period(date):
    if (date.month in [1, 2]):  
        return 'Winter Clearance'
    elif (date.month in [3, 4, 5]):  
        return 'Spring Sales'
    elif (date.month in [6, 7]):  
        return 'Summer Promotions'
    elif (date.month == 8):  
        return 'Back-to-School'
    elif (date.month == 9):  
        return 'Back-to-School'
    elif (date.month == 10):  
        return 'Early Holiday Shopping'
    elif (date.month in [11, 12]):  
        return 'Holiday Shopping'
    else:  
        return 'Other'

# Using loc to filter out records with invalid or empty InvoiceDateTime values
df.loc[df['InvoiceDateTime'].notnull(), 'SalesPeriod'] = df.loc[df['InvoiceDateTime'].notnull(), 'InvoiceDateTime'].apply(categorize_sales_period)
sales_by_period = df[df['SalesPeriod'].notnull()].groupby('SalesPeriod')['Totalsale'].sum().reset_index()
sales_by_period = sales_by_period.sort_values(by='Totalsale', ascending=False)

print(sales_by_period)

              SalesPeriod   Totalsale
3            Spring Sales  1019098.83
4       Summer Promotions   830650.70
0          Back-to-School   727597.99
2        Holiday Shopping   621684.87
5        Winter Clearance   604767.12
1  Early Holiday Shopping   344479.06


import matplotlib.pyplot as plt
import matplotlib.image as mpimg

img_path = "/kaggle/input/visualizations/Viz_topPerformersSalesVolume.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_topPerformers.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_SalesVolumeVsUnitPrice.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_SalesByCountry.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_QuarterlySales.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_QuarterlySalesYearComparison.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_CustomerYearlyComparison.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_ParetoAnalysisItems.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_ParetoAnalysisCustomers.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_FrequencyOfPurchase.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_AverageSpend.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()


img_path = "/kaggle/input/visualizations/Viz_RecencyofPurchase.png"
img = mpimg.imread(img_path)
plt.imshow(img)
plt.axis('off') 
plt.show()

	InvoiceNo	InvoiceDate	InvoiceTime	StockCode	Description	Quantity	UnitPrice	Totalsale	CustomerID	Country
0	536365	01-12-2010	08:26:00 AM	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	2.55	15.30	17850.0	United Kingdom
1	536365	01-12-2010	08:26:00 AM	71053	WHITE METAL LANTERN	6	3.39	20.34	17850.0	United Kingdom
2	536365	01-12-2010	08:26:00 AM	84406B	CREAM CUPID HEARTS COAT HANGER	8	2.75	22.00	17850.0	United Kingdom
3	536365	01-12-2010	08:26:00 AM	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	3.39	20.34	17850.0	United Kingdom
4	536365	01-12-2010	08:26:00 AM	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	3.39	20.34	17850.0	United Kingdom

	InvoiceNo	StockCode	Description	Quantity	UnitPrice	Totalsale	CustomerID	Country	InvoiceDateTime	Quarter
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	2.55	15.30	17850.0	United Kingdom	2010-01-12 08:26:00	1.0
1	536365	71053	WHITE METAL LANTERN	6	3.39	20.34	17850.0	United Kingdom	2010-01-12 08:26:00	1.0
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	2.75	22.00	17850.0	United Kingdom	2010-01-12 08:26:00	1.0
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	3.39	20.34	17850.0	United Kingdom	2010-01-12 08:26:00	1.0
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	3.39	20.34	17850.0	United Kingdom	2010-01-12 08:26:00	1.0

Online Retail Sales Analysis: A Data-Driven Approach to Optimizing Sales and Customer Insights¶

Project Goals¶

Potential Insights¶

Import Necessary Libraries¶

Verify the Dataset¶

Data Cleaning¶

InvoiceDate:¶

InvoiceDateTime:¶

CustomerID:¶

Other Columns:¶

Initial Exploratory Data Analysis¶

Top Products by Sales Volume¶

Top Products by Revenue¶

Analyze Regional Sales¶

Seasonal Trends¶

Visualizations in Tableau and Analysis¶

Top Performing Products¶

Sales Volume Vs. Unit Price¶

Sales Distribution By Country¶

Quarterly Sales¶

Pareto Analysis¶

Customer Segmentation¶