title: “Itom 6253 Programming for Analytics Fall A 2021 – Assignment 1
html_document: default
pdf_document: default
date: “Fall A 2021”
“`{r set_global_options, echo=TRUE}
knitr::opts_chunk$set(echo = TRUE)
The raw data file for this assignment is full of problems. There are missing values, outliers and invalid values (e.g., negative revenue). There are extra commas at the end of each line that cause the creation of empty nonsense variables and a few lines are missing a comma to separate the last two variables. Many lines also have zero purchases for all products.
Before doing any analytics, we must first scrub the raw data to solve these problems. We make use of the **dplyr** and **tidyr** packages for many of the scrubbing tasks.
### Instructions
This template requires that you replace all of the instances of “***” with your own code. Sometimes you will need to supply just a single keyword or argument to a function, sometimes a single line of code, and sometimes you need to supply multiple lines of code. Do your best and try to run the entire markdown document without errors for submission.
Once you have finished, knit the document to a pdf file and submit your version of the markdown file (*.rmd) and the associated pdf to the assignment on Canvas.
“`{r echo=TRUE}
library(***)
library(***)
library(***)
library(readxl)
cbc <- as.data.frame(read_excel("***/Charles_BookClub_modified.xls")) cbc <- cbc[,***] summary(cbc) #### Count the unique values of each variable. cbc_counts <- cbc *** summarise_all(***) cbc_counts #### Convert Gender and Florence to Factors ***Gender <- ***(cbc$Gender, *** = c("Female", "Male")) ***Florence <- ***(cbc$Florence, *** = c("No", "Yes")) #### Identify the numeric columns for which outlier detection is desired outvars <- c("***", "R", "***", "FirstPurch") #### Find outliers and set them to missing Note the use of the *anonymous* function in the following code: cbc[outvars] <- data.frame(lapply(cbc[***], function(x) { ifelse((x < 0) | x > (mean(x, na.rm = ***) + 3*sd(***)), NA, x) }))
#### Summary also counts the number of missing values
summary(cbc)
#### Identify variables for which imputation of missing values is desired
missvars <- c(***)
#### Impute missing values of columns with missing values
Here's another *anonymous* function use:
cbc[missvars] <- data.frame(lapply(cbc[***], function(x) {
ifelse(is.***(x), mean(x, na.rm = ***), x) }))
summary(cbc)
#### Delete rows for which there are no books purchased.
cbc_no_zeroes <- cbc[***, ]
summary(cbc_no_zeroes)
nrow(cbc_no_zeroes)
#### Sum the purchases of each book type.
cbc_sums <- cbc %>% summarise(***(c(***:***), sum))
### Histogram plot of numeric variables
library(psych)
multi.hist(cbc[***],nrow=2,ncol=2, global = FALSE)
### Bar plot of book type sums
cbc_pivot <- pivot_longer(cbc_sums, cols=c(***))
names(cbc_pivot) <- c("Type", "Sum")
ggplot(data=cbc_pivot, aes(y=***, x=***)) +
geom_***(stat = 'identity')
### A custom function for calculating 4 moments
library(e1071)
#browser()
calcfourstats <- function(x) {
mu <- round(***), 2)
sigma <- round(***), 2)
skew <- round(***), 3)
kurt <- round(***), 2)
result <- data.frame(mu, sigma, skew, kurt)
results <- calcfourstats(cbc_no_zeroes[, ***])
results <- rbind(results, calcfourstats(cbc_no_zeroes[, ***]))
#browser()
results <- rbind(results, calcfourstats(cbc_no_zeroes[, ***]))
results <- rbind(results, calcfourstats(cbc_no_zeroes[, ***]))
varList <- ***(cbc_no_zeroes[***:***])
print(varList)
rownames(results) <- varList
print(results)
### Creating RFM factors
#### Calculate HML cutoffs for RFM
cbc_rfm <- data.frame(lapply(cbc_no_zeroes[c("R", "F", "M")],
function(x) {
quantile(x, probs = c(***), na.rm = ***) }))
Verify results and test subsetting
cbc_rfm["33%", ***] #What is the 33rd percentile of M?
Create three new variables for HML quantiles of RFM variables
library(dplyr)
cbcRFM <- cbc_no_zeroes %>%
mutate(rRFM = if_else(R <= cbc_rfm[***], "L",
if_else(R >= cbc_rfm[***], “H”, “M”))) ***
mutate(fRFM = if_else(***, “L”,
if_else(F >= ***) %>%
mutate(mRFM = if_else(***, “L”,
if_else(M >= ***, “H”, “M”)))
Convert the new HML variables into ordered factors
cbcRFM[c(“rRFM”, “fRFM”, “mRFM”)] <- data.frame(lapply(cbcRFM[c("rRFM", "fRFM", "mRFM")],
function(x) {
***(x, c("L", "M", "H"), ordered = ***)
head(cbcRFM)
str(cbcRFM)
sumTable <- cbcRFM %>%
group_by(rRFM, fRFM, mRFM) ***
summarize(meanM = round(mean(M), 2))
#### Make three tables, one for each level of factor mRFM
“`{r, echo=TRUE, message=FALSE, warning=FALSE}
for (i in c(“L”, ***, ***)) {
shortTable <- xtabs(meanM ~ rRFM + fRFM, sumTable %>% filter(mRFM == i))
print(paste(‘Monetary Value Segment =’, i))
print(shortTable)
cat(“\n”) # Add a blank line between tables
### Median monetary value per visit by gender
visitValue <- cbcRFM ***
***_by(factor(Gender, labels = c("***", "***"))) %>%
summarize(medianM = round(median(M / F), 2))
visitValue
#### Bubble plot of M by R by Gender
ggplot(cbcRFM, ***(x = R, y = ***, col = factor(***, labels = c(“Female”, “Male”)), size = FirstPurch)) +
geom_point(alpha = .20) +
labs(x = “***”, y = “***”) +
facet_wrap(~ factor(***, labels = c(“***”, “***”)), labeller = label_parsed) +
theme(legend.position = “bottom”, legend.box = “vertical”,
legend.key = element_rect(colour = ‘white’, fill = ‘white’))