knitr::opts_chunk$set(
warning = FALSE,
message = FALSE)
library(tidyverse)
library(dplyr)
library(rvest)
library(purrr)
library(ggplot2)
library(modelr)
library(mgcv)
library(patchwork)
library(viridis)
library(fastDummies)
set.seed(1)
childcare_inspection_df = read_csv("./data/DOHMH_Childcare_Center_Inspections.csv") %>%
janitor::clean_names() %>%
distinct() %>%
select(center_name, borough, zip_code, status, age_range, maximum_capacity,program_type, facility_type,
child_care_type, violation_category,
violation_status,violation_rate_percent:average_critical_violation_rate,regulation_summary,
inspection_summary_result) %>%
drop_na(zip_code, age_range, violation_rate_percent,public_health_hazard_violation_rate, critical_violation_rate) %>%
filter(maximum_capacity != 0) %>%
mutate(
educational_worker_ratio = total_educational_workers/maximum_capacity,
program_type = tolower(program_type),
facility_type = tolower(facility_type),
borough = as.factor(borough),
status = as.factor(status),
program_type = as.factor(program_type),
facility_type = as.factor(facility_type),
child_care_type = as.factor(child_care_type),
age_range = as.factor(age_range)
) %>%
filter(program_type != "school age camp")
center_specific_df = childcare_inspection_df %>%
relocate(center_name, program_type) %>%
group_by(center_name, program_type) %>%
mutate(
n_na = sum(is.na(violation_category)),
n_violation = sum(!is.na(violation_category)),
rate = n_violation/(n_violation + n_na)) %>%
arrange(center_name, program_type)
center_specific_df %>%
ggplot(aes(x = rate, fill = program_type)) +
geom_density(alpha = .5) +
facet_grid(~program_type) +
viridis::scale_fill_viridis(discrete = TRUE) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) +
labs(
title = "Violation Rate Distribution by Program Type",
x = "Violation Rate",
y = "Rate Density"
)

From the density plot above, we can learn that the “preschool” program type has the highest rate density when the violation rate is about 0.625, which may probably mean that childcare centers with preschool program type have a higher mean violation rate.
center_specific_df %>%
ggplot(aes(x = rate, fill = borough)) +
geom_density(alpha = .5) +
facet_grid(~borough) +
viridis::scale_fill_viridis(discrete = TRUE) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),plot.title = element_text(hjust = 0.5)) +
labs(
title = "Violation Rate Distribution by Borough",
x = "Violation Rate",
y = "Rate Density"
)

From the density plot above, we can learn that Bronx has a higher density of high violation rate compared with other boroughs.