# Load libraries ----
# Work horse packages
library(tidyverse)
library(lubridate)
# theme_tq()
library(tidyquant)
# Excel Files
library(readxl)
library(writexl)
# Visualization
library(plotly)
# Preprocessing and Sampling
library(recipes)
library(rsample)
# Model Error Metrics
library(yardstick)
# Modeling
library(parsnip)
library(glmnet)
library(randomForest)
library(xgboost)
# Plot Decision Trees
library(rpart)
library(rpart.plot)
library(ggrepel)
library(ranger)
library(kernlab)
library(broom)
library(uwot)
# Importing Files ----
bikes_tbl <- read_excel(path = "00_data/bike_sales/data_raw/bikes.xlsx")
bikeshops_tbl <- read_excel(path = "00_data/bike_sales/data_raw/bikeshops.xlsx")
orderlines_tbl <- read_excel(path = "00_data/bike_sales/data_raw/orderlines.xlsx")
source("00_scripts/plot_sales.R")
source("00_scripts/plot_customer_segmentation.R")
glimpse(bikes_tbl)
## Rows: 97
## Columns: 4
## $ bike.id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ model <chr> "Supersix Evo Black Inc.", "Supersix Evo Hi-Mod Team", ...
## $ description <chr> "Road - Elite Road - Carbon", "Road - Elite Road - Carb...
## $ price <dbl> 12790, 10660, 7990, 5330, 4260, 3940, 3200, 2660, 2240,...
bikes_tbl
## # A tibble: 97 x 4
## bike.id model description price
## <dbl> <chr> <chr> <dbl>
## 1 1 Supersix Evo Black Inc. Road - Elite Road - Carbon 12790
## 2 2 Supersix Evo Hi-Mod Team Road - Elite Road - Carbon 10660
## 3 3 Supersix Evo Hi-Mod Dura Ace 1 Road - Elite Road - Carbon 7990
## 4 4 Supersix Evo Hi-Mod Dura Ace 2 Road - Elite Road - Carbon 5330
## 5 5 Supersix Evo Hi-Mod Utegra Road - Elite Road - Carbon 4260
## 6 6 Supersix Evo Red Road - Elite Road - Carbon 3940
## 7 7 Supersix Evo Ultegra 3 Road - Elite Road - Carbon 3200
## 8 8 Supersix Evo Ultegra 4 Road - Elite Road - Carbon 2660
## 9 9 Supersix Evo 105 Road - Elite Road - Carbon 2240
## 10 10 Supersix Evo Tiagra Road - Elite Road - Carbon 1840
## # ... with 87 more rows
glimpse(bikeshops_tbl)
## Rows: 30
## Columns: 3
## $ bikeshop.id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ bikeshop.name <chr> "Pittsburgh Mountain Machines", "Ithaca Mountain Clim...
## $ location <chr> "Pittsburgh, PA", "Ithaca, NY", "Columbus, OH", "Detr...
bikeshops_tbl
## # A tibble: 30 x 3
## bikeshop.id bikeshop.name location
## <dbl> <chr> <chr>
## 1 1 Pittsburgh Mountain Machines Pittsburgh, PA
## 2 2 Ithaca Mountain Climbers Ithaca, NY
## 3 3 Columbus Race Equipment Columbus, OH
## 4 4 Detroit Cycles Detroit, MI
## 5 5 Cincinnati Speed Cincinnati, OH
## 6 6 Louisville Race Equipment Louisville, KY
## 7 7 Nashville Cruisers Nashville, TN
## 8 8 Denver Bike Shop Denver, CO
## 9 9 Minneapolis Bike Shop Minneapolis, MN
## 10 10 Kansas City 29ers Kansas City, KS
## # ... with 20 more rows
glimpse(orderlines_tbl)
## Rows: 15,644
## Columns: 7
## $ ...1 <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"...
## $ order.id <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7...
## $ order.line <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2, 3, 4, 1...
## $ order.date <dttm> 2015-01-07, 2015-01-07, 2015-01-10, 2015-01-10, 2015-0...
## $ customer.id <dbl> 2, 2, 10, 10, 6, 6, 6, 6, 6, 22, 8, 8, 8, 8, 16, 16, 16...
## $ product.id <dbl> 48, 52, 76, 52, 2, 50, 1, 4, 34, 26, 96, 66, 35, 72, 45...
## $ quantity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1...
orderlines_tbl
## # A tibble: 15,644 x 7
## ...1 order.id order.line order.date customer.id product.id quantity
## <chr> <dbl> <dbl> <dttm> <dbl> <dbl> <dbl>
## 1 1 1 1 2015-01-07 00:00:00 2 48 1
## 2 2 1 2 2015-01-07 00:00:00 2 52 1
## 3 3 2 1 2015-01-10 00:00:00 10 76 1
## 4 4 2 2 2015-01-10 00:00:00 10 52 1
## 5 5 3 1 2015-01-10 00:00:00 6 2 1
## 6 6 3 2 2015-01-10 00:00:00 6 50 1
## 7 7 3 3 2015-01-10 00:00:00 6 1 1
## 8 8 3 4 2015-01-10 00:00:00 6 4 1
## 9 9 3 5 2015-01-10 00:00:00 6 34 1
## 10 10 4 1 2015-01-11 00:00:00 22 26 1
## # ... with 15,634 more rows
bike_orderlines_joined_tbl <- orderlines_tbl %>%
left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))
bike_orderlines_tbl <- bike_orderlines_joined_tbl %>%
separate(
description,
into = c("category_1", "category_2", "frame_material"),
sep = " - "
) %>%
separate(location,
into = c("city", "state"),
sep = ", ") %>%
mutate(total_price = quantity * price) %>%
select(-...1,-ends_with(".id")) %>%
bind_cols(bike_orderlines_joined_tbl %>%
select(order.id)) %>%
select(
contains("date"),
contains("id"),
contains("order"),
quantity,
price,
total_price,
everything()
) %>%
set_names(names(.) %>% str_replace_all("\\.", "_"))
sales_by_year_category_2_tbl <- bike_orderlines_tbl %>%
select(order_date, category_2, total_price) %>%
mutate(order_date = ymd(order_date)) %>%
mutate(year = year(order_date)) %>%
group_by(category_2, year) %>%
summarize(revenue = sum(total_price)) %>%
ungroup() %>%
mutate(category_2 = fct_reorder2(category_2, year, revenue))
# Step 1 - Manipulate
sales_by_year_tbl <- bike_orderlines_tbl %>%
# Get columns we want
select(order_date, total_price) %>%
mutate(year = year(order_date)) %>%
# groupings
group_by(year) %>%
summarize(sales = sum(total_price)) %>%
ungroup() %>%
# get dollar text
mutate(sales_text = scales::dollar(sales))
# Step 2 - Visualize
sales_by_year_tbl %>%
ggplot(aes(x = year, y = sales)) +
geom_col(fill = "#2C3E50") +
geom_label(aes(label = sales_text)) +
geom_smooth(method = "lm",
se = FALSE) +
theme_tq() +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Revenue by Year",
subtitle = "There's an upward trend",
x = "",
y = "Revenue"
)
revenue_by_year_tbl <- bike_orderlines_tbl %>%
select(order_date, total_price) %>%
mutate(year = year(order_date)) %>%
group_by(year) %>%
summarize(revenue = sum(total_price)) %>%
ungroup()
revenue_by_year_tbl %>%
# Canvas
ggplot(aes(x = year, y = revenue, color = revenue)) +
# Geometries
geom_line(size = 1) +
geom_point(size = 5) +
geom_smooth(method = "lm", se = FALSE) +
# Formatting
expand_limits(y = 0) +
scale_color_continuous(low = "red", high = "black",labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
labs(
title = "Revenue",
subtitle = "Sales are trending up and to the right!",
x = "",
y = "Sales (Millions)",
color = "Rev ($M)",
caption = "What's happening?\nSales numbers showing year-over-year growth.") +
theme_bw() +
theme(legend.position = "right", legend.direction = "vertical")
sales_by_year_category_2_tbl %>%
ggplot(mapping = aes(x = year, y = revenue, color = revenue)) +
geom_line(size = 1) +
geom_point(size = 3) +
facet_wrap(~ category_2, scales = "free_y") +
expand_limits(y = 0) +
scale_y_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M")) +
geom_smooth(method = "lm", se = FALSE) +
theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.background = element_rect(
color = "black",
fill = "cornflowerblue",
size = 1),
strip.text = element_text(face = "bold", color = "white")) +
labs(
title = "Positive trending sales",
caption = "5-year sales trends comes from our ERP database",
x = "",
y = ""
)
Describe revenue by Month, expose cyclic nature.
# Data Manipulation
revenue_by_month_tbl <- bike_orderlines_tbl %>%
select(order_date, total_price) %>%
mutate(year_month = floor_date(order_date, "months") %>% ymd()) %>%
group_by(year_month) %>%
summarize(revenue = sum(total_price)) %>%
ungroup()
# Line Plot
revenue_by_month_tbl %>%
ggplot(
mapping = aes(x = year_month, y = revenue)) +
geom_line() +
geom_smooth(span = 0.2) +
theme_classic()
# Step 1 - Manipulate
sales_by_year_cat_2_tbl <- bike_orderlines_tbl %>%
select(order_date, category_2, total_price) %>%
mutate(year = year(order_date)) %>%
group_by(year, category_2) %>%
summarize(sales = sum(total_price)) %>%
ungroup() %>%
mutate(sales_text = scales::dollar(sales))
# Step 2 - Visualize
sales_by_year_cat_2_tbl %>%
ggplot(aes(x = year, y = sales, fill = category_2)) +
geom_col() +
geom_smooth(method = "lm",
se = FALSE) +
facet_wrap(~ category_2, scales = "free_y") +
theme_tq() +
scale_fill_tq() +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Revenue by year and category 2",
subtitle = "Each product category has an upward trend",
x = "",
y = "Revenue",
fill = "Product Secondary Category"
)
# - Great way to tease out variation by category
sales_by_year_category_2_tbl %>%
ggplot(mapping = aes(x = year, y = revenue, color = category_2)) +
geom_line(size = 1, color = "black") +
geom_point() +
geom_smooth(method = "lm", se = F) +
facet_wrap(~ category_2, scales = "free_y") +
scale_y_continuous(
labels = scales::dollar_format(scale = 1/1e6, suffix = "M")) +
expand_limits(y = 0) +
labs(title = "Sales by Category 2", color = "Category 2", x = "", y = "Revenue") +
theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.background = element_rect(
color = "black",
fill = "cornflowerblue",
size = 1),
strip.text = element_text(face = "bold", color = "white"))
# Bar / Column Plots ---- Categories
revenue_by_category_tbl <- bike_orderlines_tbl %>%
select(category_2, category_1, total_price) %>%
group_by(category_2, category_1) %>%
summarise(total_revenue = sum(total_price)) %>%
ungroup() %>%
arrange(desc(total_revenue)) %>%
mutate(category_2 = as_factor(category_2) %>% fct_rev())
# Bar Plot
g <- revenue_by_category_tbl %>%
ggplot(aes(category_2, total_revenue, fill = category_1)) +
# Geoms
geom_col() +
coord_flip() +
# Formatting
scale_fill_tq() +
scale_y_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M")) +
theme_tq() +
labs(
title = "Total Revenue by Category",
x = "", y = "", fill = ""
)
ggplotly(g)
# Fill -----
# - Used with fill of rectangular objects.
sales_by_year_category_2_tbl %>%
ggplot(mapping = aes(x = year, y = revenue, fill = category_2)) +
geom_col() +
scale_y_continuous(labels = scales::dollar_format(scale = 1/1e6, suffix = "M"))
# Stacked Area
sales_by_year_category_2_tbl %>%
ggplot(mapping = aes(x = year, y = revenue, fill = category_2)) +
geom_area(color = "black") +
scale_fill_brewer(palette = "Blues", direction = -1) +
scale_y_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M")) +
labs(
title = "Sales over year by secondary category",
subtitle = "Sales trending upward",
caption = "Data comes from our ERP database",
x = "",
y = "Revenue ($M)",
fill = "Category 2") +
theme_light() +
theme(
title = element_text(face = "bold", color = "#08306B"))
# Histogram / Density Plots
# Inspecting the distribution of a variable
bike_orderlines_tbl %>%
distinct(model, price) %>%
ggplot(mapping = aes(x = price)) +
geom_histogram(bins = 25, fill = "blue", color = "white") +
tidyquant::theme_tq()
# Histogram
bike_orderlines_tbl %>%
distinct(price, model, frame_material) %>%
ggplot(mapping = aes(x = price, fill = frame_material)) +
geom_histogram() +
facet_wrap(~ frame_material, ncol = 1) +
tidyquant::theme_tq() +
tidyquant::scale_fill_tq() +
theme_classic()
# Density
bike_orderlines_tbl %>%
distinct(price, model, frame_material) %>%
ggplot(mapping = aes(x = price, fill = frame_material)) +
geom_density(alpha = 0.618) +
tidyquant::scale_fill_tq() +
tidyquant::theme_tq()
# Box Plot / Violin Plot (Comparing distributions)
# Data Manipulation
unit_price_by_cat2_tbl <- bike_orderlines_tbl %>%
distinct(category_2, model, price) %>%
mutate(category_2 = category_2 %>% as_factor() %>% fct_reorder(price))
# Box Plot
unit_price_by_cat2_tbl %>%
ggplot(mapping = aes(x = category_2, y = price)) +
geom_boxplot() +
coord_flip() +
tidyquant::theme_tq()
# Violin Plot & Jitter Plot
unit_price_by_cat2_tbl %>%
ggplot(mapping = aes(x = category_2, y = price)) +
geom_violin() +
geom_jitter(width = 0.2, color = "#2c3e50") +
coord_flip() +
tidyquant::theme_tq()
Visualize top N customers in terms of Revenue, include cumulative percentage
n <- 15
# Data Manipulation
top_customers_tbl <- bike_orderlines_tbl %>%
select(bikeshop_name, total_price) %>%
mutate(bikeshop_name = bikeshop_name %>% as_factor() %>% fct_lump(n = n, w = total_price)) %>%
group_by(bikeshop_name) %>%
summarise(revenue = sum(total_price)) %>%
ungroup() %>%
mutate(bikeshop_name = bikeshop_name %>% fct_reorder(revenue)) %>%
mutate(bikeshop_name = bikeshop_name %>% fct_relevel("Other", after = 0)) %>%
arrange(desc(bikeshop_name)) %>%
# revenue text
mutate(revenue_text = scales::dollar(revenue, scale = 1e-6, suffix = "M")) %>%
# cumulative percent
mutate(cum_pct = cumsum(revenue)/sum(revenue)) %>%
mutate(cum_pct_text = scales::percent(cum_pct)) %>%
# Get a rank
mutate(rank = row_number()) %>%
mutate(rank = case_when(
rank == max(rank) ~ NA_integer_,
TRUE ~ rank)) %>%
# Label Text
mutate(label_text = str_glue("Rank: {rank}\nRev: {revenue_text}\nCumPct: {cum_pct_text}"))
# Data Visualization
top_customers_tbl %>%
ggplot(mapping = aes(x = revenue, y = bikeshop_name)) +
geom_segment(mapping = aes(xend = 0, yend = bikeshop_name), color = palette_light()[1], size = 1) +
geom_point(color = palette_light()[1], mapping = aes(size = revenue)) +
geom_label(mapping = aes(label = label_text), hjust = "inward", size = 3, color = palette_light()[1]) +
# Formatting
scale_x_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M")) +
labs(title = str_glue("Top {n} Customers"), subtitle = str_glue("Start: {year(min(bike_orderlines_tbl$order_date))}
End: {year(max(bike_orderlines_tbl$order_date))}"),
x = "Revenue ($M)",
y = "Customer",
caption = str_glue("Top 6 customers contribute 51% of purchasing power")) +
theme_tq() +
theme(legend.position = "none", plot.title = element_text(face = "bold"),
plot.caption = element_text(face = "bold"))
Marketing would like to increase email campaign engagement by segmenting the customer-base using their buying habits.
Customer Trends: Customer purchase history for similarity to other “like” customers.
Our customer-base consists of 30 bike shops. Several customers have purchasing preferences for Road or Mountain Bikes based on the proportion of bikes purchased by category_1 and category_2.
Heatmap of proportion of sales by secondary product category
# Data Manipulation
pct_sales_by_customer_tbl <- bike_orderlines_tbl %>%
select(bikeshop_name, category_1, category_2, quantity) %>%
group_by(bikeshop_name, category_1, category_2) %>%
summarise(total_quantity = sum(quantity, na.rm = TRUE)) %>%
ungroup() %>%
group_by(bikeshop_name) %>%
mutate(pct = total_quantity/sum(total_quantity, na.rm = TRUE)) %>%
ungroup() %>%
# List shops by alpha
mutate(bikeshop_name = as.factor(bikeshop_name) %>% fct_rev()) %>%
#mutate(bikeshop_name_num = bikeshop_name %>% as.numeric()) %>%
mutate(label_text = str_glue("Customer: {bikeshop_name}
Category = {category_1}
Sub-Category = {category_2}
Quantity Purchased: {total_quantity}
Percent of Sales: {scales::percent(pct)}"))
# Data Visualization
g <- pct_sales_by_customer_tbl %>%
ggplot(
mapping = aes(x = category_2, y = bikeshop_name)) +
# Geometries
geom_tile(
mapping = aes(fill = pct)) +
geom_text(
mapping = aes(
label = scales::percent(pct, accuracy = .01),
text = label_text), size = 3) +
facet_wrap(~ category_1, scales = "free_x") +
# Formatting
scale_fill_gradient(low = "white", high = palette_light()[1]) +
labs(
title = "Heatmap of Purchasing Habits",
x = "", #Bike Type (Cateogry 2)
y = "", #Customer
caption = str_glue("Customers that prefer Road: Ann Arbor Speed, Austin Cruisers, & Indianapolis Velocipedes
Customers that prefer Mountain: Ithica Mountain Climbers, Pittsburgh Mountain Machines, & Tampa 29ers")) +
theme_tq() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
plot.caption = element_text(face = "bold.italic"),
plot.title = element_text(face = "bold"))
# strip.text.x = element_text(margin = margin(5,5,5,5, unit = "pt")))
ggplotly(g, tooltip = "text")
Explain relationship between order value and quantity of bikes sold.
# - Continuous vs Continuous
# Explain relationship between order value and quantity of bikes sold
# Data Manipulation
order_value_tbl <- bike_orderlines_tbl %>%
select(order_id, order_line, total_price, quantity) %>%
group_by(order_id) %>%
summarize(
total_quantity = sum(quantity),
total_price = sum(total_price)) %>%
ungroup()
# Scatter Plot
order_value_tbl %>%
ggplot(
mapping = aes(
x = total_quantity, y = total_price)) +
geom_point(alpha = 0.312, position = "jitter", size = 2) +
geom_smooth(method = "lm", se = F) +
theme_classic()
This is a 2D Projection based on customer similarity that exposes 4 clusters, which are key segments in the customer base.
# Plot customer segments
plot_customer_segments(interactive = interactive, k = 4, seed = 123)
# Plot customer segments
plot_customer_segments(interactive = FALSE, k = 4, seed = 123)
The 4 customer segments were given descriptions based on the customer’s top product purchases.
Segment 1 Preferences: Mountain Bikes, Above $3k
Segment 2 Preferences: Road Bikes, Above $3k
Segment 3 Preferences: Road Bikes, Below $3k
Segment 4 Preferences: Mountain Bikes, Below $3k
plot_customer_behavior_by_cluster(interactive = interactive, top_n_products = 10, k = 4, seed = 123)