From f3035a2ab4a04783c2506514be310e316bf6afb1 Mon Sep 17 00:00:00 2001 From: Tom Schenk Date: Thu, 11 Sep 2014 20:38:30 -0500 Subject: [PATCH] Initial commit --- .gitignore | 4 + R/Sanitarians.R | 159 +++++++++++++++++++++++++ R/anonymizeInspections.R | 14 +++ R/foodInspectionUtilities.R | 40 +++++++ R/foodInspectionZipCodes.R | 115 ++++++++++++++++++ R/foodInspections.R | 186 +++++++++++++++++++++++++++++ R/foodInspectionsEvaluation.R | 165 ++++++++++++++++++++++++++ R/foodInspectionsModel.R | 88 ++++++++++++++ R/overPlotYearsByMonth.R | 23 ++++ R/readBusinessLicenses.R | 19 +++ R/readChicagoZipCodes.R | 7 ++ R/readFoodInspections.R | 93 +++++++++++++++ R/readFoodInspectionsHistory.R | 15 +++ R/readGarrison.R | 56 +++++++++ R/readGarrisonInspections.R | 67 +++++++++++ R/readGarrisonLicenses.R | 208 +++++++++++++++++++++++++++++++++ R/runCheckZips.R | 8 ++ R/tests/runAllTests.R | 19 +++ R/tests/testAnonymize.R | 16 +++ R/tests/testViolations.R | 26 +++++ R/utilities.R | 157 +++++++++++++++++++++++++ README.md | 17 +++ runCheckZips.bat | 3 + 23 files changed, 1505 insertions(+) create mode 100644 .gitignore create mode 100644 R/Sanitarians.R create mode 100644 R/anonymizeInspections.R create mode 100644 R/foodInspectionUtilities.R create mode 100644 R/foodInspectionZipCodes.R create mode 100644 R/foodInspections.R create mode 100644 R/foodInspectionsEvaluation.R create mode 100644 R/foodInspectionsModel.R create mode 100644 R/overPlotYearsByMonth.R create mode 100644 R/readBusinessLicenses.R create mode 100644 R/readChicagoZipCodes.R create mode 100644 R/readFoodInspections.R create mode 100644 R/readFoodInspectionsHistory.R create mode 100644 R/readGarrison.R create mode 100644 R/readGarrisonInspections.R create mode 100644 R/readGarrisonLicenses.R create mode 100644 R/runCheckZips.R create mode 100644 R/tests/runAllTests.R create mode 100644 R/tests/testAnonymize.R create mode 100644 R/tests/testViolations.R create mode 100644 R/utilities.R create mode 100644 
README.md create mode 100644 runCheckZips.bat diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..074e16f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/FoodInspectionEvaluation.zip +/.project +/out +/data diff --git a/R/Sanitarians.R b/R/Sanitarians.R new file mode 100644 index 0000000..94160e8 --- /dev/null +++ b/R/Sanitarians.R @@ -0,0 +1,159 @@ +# TITLE: Sanitarians.R +# AUTHOR: Hugh J. Devlin, Ph. D. +# CREATED: 2014-04-17 + +options(warn=1) +options(error=utils::recover) +options(max.print=15000) +options(width=200) + +source("readGarrison.R") +source("readFoodInspectionsHistory.R") +source("overPlotYearsByMonth.R") + +# Matching inspections data +analyzeSanitarians <- function(garrison, portal) { + + # decorate Garrison data frame + garrison$year <- year(garrison$Date) + garrison$month <- month(garrison$Date) + + logMsg(paste('Unique license numbers in Garrison inspections =', length(unique(garrison$License)))) + logMsg(paste('Unique zip codes in Garrison inspections =', length(unique(garrison$Zip)))) + logMsg(paste('Unique sanitarian in Garrison inspections =', length(unique(garrison$Inspector)))) + summarize(garrison) + + logMsg(paste('Unique license numbers in portal inspections =', length(unique(portal$License)))) + logMsg(paste('Unique zip codes in portal inspections =', length(unique(portal$Zip)))) + summarize(portal) + + logMsg('Garrison inspections by sanitarian:') + bySanitarian <- as.matrix(table(garrison$Inspector)) + print(bySanitarian) + print(summary(as.integer(bySanitarian))) + + logMsg('Garrison inspections by year and month:') + byYearMonth <- table(garrison$year, garrison$month) + print(byYearMonth) + dev.new() + overPlotYearsByMonth( + main="Garrison inspections by year and month", + ylab="Inspections", + ylim=range(byYearMonth) + ) + lapply(2011:2014, plotYearByMonth, byYearMonth) + addMonthAxisAndYearsLegend(years=2011:2014) + dev.copy(svg, filename="../out/GarrisonInspectionsByYearAndMonth.svg") + dev.off() + + 
duplicatedGarrison <- duplicated(garrison[ , c("License", "Date")]) + logMsg(paste("Removing duplicate inspections from Garrison:", sum(duplicatedGarrison))) + garrison <- garrison[!duplicatedGarrison , ] + + duplicatedPortal <- duplicated(portal[ , c("License", "Inspection.Date")]) + logMsg(paste("Removing duplicate inspections from portal:", sum(duplicatedPortal))) + portal <- portal[!duplicatedPortal , ] + + df <- merge( + subset(garrison, select=-c(Address)), + subset(portal, select=-c(Zip, DBA.Name, AKA.Name, Inspection.Type, City, State, Latitude, Longitude, Location)), + by.x=c("License", "Date"), by.y=c("License", "Inspection.Date")) + duplicatedDf <- duplicated(df[ , c("License", "Date")]) + logMsg(paste("Removing duplicate inspections:", sum(duplicatedDf))) + df <- df[!duplicatedDf , ] + logMsg(paste('Unique license numbers in inspections =', length(unique(df$License)))) + logMsg(paste('Unique zip codes in inspections =', length(unique(df$Zip)))) + logMsg(paste('Unique sanitarians in inspections =', length(unique(df$Inspector)))) + summarize(df) + + logMsg('Inspections by sanitarian and year:') + bySanitarianAndYear <- table(df$Inspector, df$year) + print(bySanitarianAndYear) + dev.new() + par(las=1) # horizontal axis labels + stripchart(values ~ ind, + data=stack(as.data.frame.matrix(bySanitarianAndYear)), + pch=20, + main="Inspections by sanitarian and year", + xlab="Year", + ylab="Inspections", + vertical=TRUE, + col="blue" + ) + apply(bySanitarianAndYear, 1, lines, col="blue") + dev.copy(svg, filename="../out/InspectionsBySanitarianAndYear.svg") + dev.off() + + logMsg('Fails by sanitarian and year:') + fails <- df[df$Results == 'Fail' , ] + failsBySanitarianAndYear <- table(fails$Inspector, fails$year) + print(failsBySanitarianAndYear) + dev.new() + par(las=1) # horizontal axis labels + stripchart(values ~ ind, + data=stack(as.data.frame.matrix(failsBySanitarianAndYear)), + pch=20, + main="Fails by sanitarian and year", + xlab="Year", + ylab="Fails", 
+ vertical=TRUE, + col="blue" + ) + apply(failsBySanitarianAndYear, 1, lines, col="blue") + dev.copy(svg, filename="../out/FailsBySanitarianAndYear.svg") + dev.off() + + logMsg('Fail counts correlation by sanitarian from year-to-year:') + print(cor(failsBySanitarianAndYear, use="pairwise.complete.obs")) + logMsg('Fail counts by sanitarian correlation 2013 to 2014 with confidence interval and p-value:') + print(cor.test(failsBySanitarianAndYear[ , "2013"], failsBySanitarianAndYear[ , "2014"])) # cor.test() has no 'use' argument; it drops incomplete pairs itself + + logMsg('Failure rates by sanitarian and year:') + failureRateBySanitarianAndYear <- failsBySanitarianAndYear / bySanitarianAndYear + print(failureRateBySanitarianAndYear) + dev.new() + par(las=1) # horizontal axis labels + stripchart(values ~ ind, + data=stack(as.data.frame.matrix(failureRateBySanitarianAndYear)), + pch=20, + main="Failure rates by sanitarian and year", + xlab="Year", + ylab="Failure rate", + vertical=TRUE, + col="blue" + ) + apply(failureRateBySanitarianAndYear, 1, lines, col="blue") + dev.copy(svg, filename="../out/FailureRatesBySanitarianAndYear.svg") + dev.off() + + logMsg('Failure rate correlation by sanitarian from year-to-year:') + print(cor(failureRateBySanitarianAndYear, use="pairwise.complete.obs")) + logMsg('Failure rate by sanitarian correlation 2013 to 2014 with confidence interval and p-value:') + print(cor.test(failureRateBySanitarianAndYear[ , "2013"], failureRateBySanitarianAndYear[ , "2014"])) # pairs with a missing rate are dropped by cor.test() itself + + sanitarians <- data.frame( + Inspections2011to2013=rowSums(bySanitarianAndYear[, c("2011", "2012", "2013")]), + Fails2011to2013=rowSums(failsBySanitarianAndYear[, c("2011", "2012", "2013")]), + Inspections2014=bySanitarianAndYear[,"2014"], + Fails2014=failsBySanitarianAndYear[,"2014"] + ) + sanitarians$Rate2011to2013 <- sanitarians$Fails2011to2013 / sanitarians$Inspections2011to2013 + sanitarians$Rate2014 <- sanitarians$Fails2014 / sanitarians$Inspections2014 + print(sanitarians) + 
logMsg('Failure rate by sanitarian correlation 2011-2013 to 2014 with confidence interval and p-value:') + print(cor.test(sanitarians$Rate2011to2013, sanitarians$Rate2014)) # cor.test() has no 'use' argument; it drops incomplete pairs itself + + logMsg('Done.') +} + +run <- function() { + logMsg('Reading Garrison food inspection history') + garrison <- cleanGarrison(readGarrison()) + logMsg(paste('Garrison food inspection history records:', nrow(garrison))) + + logMsg('Reading portal food inspection history') + portal <- readFoodInspectionsHistory() + logMsg(paste('Portal food inspection history records:', nrow(portal))) + + analyzeSanitarians(garrison, portal) +} \ No newline at end of file diff --git a/R/anonymizeInspections.R b/R/anonymizeInspections.R new file mode 100644 index 0000000..1435d9b --- /dev/null +++ b/R/anonymizeInspections.R @@ -0,0 +1,14 @@ + +source("utilities.R") +source("readGarrison.R") + +run <- function(file="../out/AnonymizedInspectionsGarrisonExport20112014.csv") { + logMsg('Reading Garrison food inspection history') + garrison <- readGarrison() + logMsg(paste('Garrison food inspection history records:', nrow(garrison))) + logMsg('Anonymizing Garrison food inspection history records...') + garrison <- anonymizeColumns(garrison, c("Inspector")) + logMsg('Writing Garrison food inspection history') + write.table(garrison, file=file, sep=',', row.names=FALSE) + logMsg('Done.') +} \ No newline at end of file diff --git a/R/foodInspectionUtilities.R b/R/foodInspectionUtilities.R new file mode 100644 index 0000000..1155b6b --- /dev/null +++ b/R/foodInspectionUtilities.R @@ -0,0 +1,40 @@ +# Food inspection utilities +# +# Author: 368982 +############################################################################### + +# split on vertical bar, then on period after the violation code number, keep the code numbers +parseViolationCodes <- function(violations) { + lapply(lapply(strsplit(violations, "| ", fixed=TRUE), strsplit, ". 
", fixed=TRUE), function(x) as.integer(lapply(x, FUN='[', 1))) +} + +# critical=1, serious=2, minor=3, other=0 +classifyViolations <- function(violations) { + lapply(parseViolationCodes(violations), function(x) ifelse(x %in% 1:14, 1, ifelse(x %in% 15:29, 2, ifelse(x %in% 30:44, 3, 0)))) +} + +# counts by criticality +# tabulate ignores zeroes +# returns an integer matrix with a row for each inspection, and 3 columns for critical, serious, and minor counts +violationCounts <- function(violations) { + result <- do.call(rbind, lapply(classifyViolations(violations), function(x) tabulate(as.integer(x), nbins=3))) + colnames(result) <- c("critical", "serious", "minor") + result +} + +# add violation counts to inspections data frame +countViolations <- function(inspections) { + result <- inspections + violationCounts <- violationCounts(inspections$Violations) + + result$criticalCount <- violationCounts[ , "critical"] + result$seriousCount <- violationCounts[ , "serious"] + result$minorCount <- violationCounts[ , "minor"] + + result$serious <- (result$seriousCount > 0) + result$critical <- (result$criticalCount > 0) + result$minor <- (result$minorCount > 0) + + result +} + diff --git a/R/foodInspectionZipCodes.R b/R/foodInspectionZipCodes.R new file mode 100644 index 0000000..fbf1cef --- /dev/null +++ b/R/foodInspectionZipCodes.R @@ -0,0 +1,115 @@ +# TITLE: foodInspectionZipCodes.R +# AUTHOR: Hugh J. Devlin, Ph. D. 
+# CREATED: 2014-04-07 + +options(warn=1) +options(error=utils::recover) +options(max.print=15000) +options(width=200) + +source("utilities.R") + +posixifyColumns <- function(df, columnNames) { + result <- df + for(columnName in columnNames) { + result[[columnName]] <- as.POSIXct(result[[columnName]], format="%m/%d/%Y") + } + result +} + +readInspectionHistory <- function(file="../data/Food_Inspections.csv") { + result <- read.csv(file, stringsAsFactors=FALSE, na.strings='') + names(result)[names(result)=="License.."] <- "License" # "License #" on the portal, rename merge column + result$Inspection.Date <- as.POSIXct(result$Inspection.Date, format="%m/%d/%Y") + result +} + +readLicenses <- function(file="../data/Business_Licenses.csv") { + result <- read.csv(file, stringsAsFactors=FALSE, na.strings='') + result <- posixifyColumns(result, + c( "APPLICATION.CREATED.DATE", + "APPLICATION.REQUIREMENTS.COMPLETE", + "PAYMENT.DATE", + "LICENSE.TERM.START.DATE", + "LICENSE.TERM.EXPIRATION.DATE", + "LICENSE.APPROVED.FOR.ISSUANCE", + "DATE.ISSUED", + "LICENSE.STATUS.CHANGE.DATE" + )) + result +} + +# Matching inspections data +compareZips <- function(inspections, licenses, startDate="2014-01-01", + licensesNotFoundPath='../out/licensesNotFound.csv', + zipCodeDiscrepencyPath='../out/foodInspectionZipCodeDiscrepencies.csv') { + + #subset food inspections by start date + inspections <- subset(inspections, Inspection.Date >= as.POSIXct(startDate, format="%Y-%m-%d")) + inspections <- inspections[complete.cases(inspections[ , c("License")]) , ] + logMsg(paste('Food inspections after', startDate, ':', nrow(inspections))) + summarize(inspections) + + inspectionLicenseNumbers <- unique(inspections$License) + logMsg(paste('Unique license numbers in food inspections =', length(inspectionLicenseNumbers))) + logMsg(paste('Unique zip codes in food inspections =', length(unique(inspections$Zip)))) + logMsg('Food inspections by zip code:') + print(sort(table(inspections$Zip), 
decreasing=TRUE)) + + # subset to currently active business licenses + licenses <- licenses[complete.cases(licenses[ , c("ACCOUNT.NUMBER", "LICENSE.NUMBER", "LICENSE.CODE", "LICENSE.TERM.START.DATE", "LICENSE.TERM.EXPIRATION.DATE")]) , ] + licenses <- licenses[which(licenses$LICENSE.STATUS == "AAI") , ] # which() drops rows with a missing status instead of turning them into all-NA rows + licenses <- licenses[licenses$LICENSE.TERM.START.DATE <= Sys.time() & Sys.time() <= licenses$LICENSE.TERM.EXPIRATION.DATE , ] + # Keep only the most recent of duplicates + licenses <- licenses[with(licenses, order(ACCOUNT.NUMBER, LICENSE.NUMBER, LICENSE.CODE, LICENSE.TERM.START.DATE)) , ] # sort + licenses <- licenses[!duplicated(licenses$LICENSE.NUMBER, fromLast=TRUE) , ] + summarize(licenses) + + licenseNumbers <- unique(licenses$LICENSE.NUMBER) + logMsg(paste('Unique license numbers in business licenses =', length(licenseNumbers))) + logMsg(paste('Unique zip codes in business licenses =', length(unique(licenses$ZIP.CODE)))) + logMsg('Licenses by zip code:') + byZipCode <- table(licenses$ZIP.CODE) + print(sort(byZipCode, decreasing=TRUE)) + + duplicates <- duplicated(licenses$LICENSE.NUMBER, fromLast=FALSE) | duplicated(licenses$LICENSE.NUMBER, fromLast=TRUE) + logMsg(paste('Duplicate active license numbers in business licenses:', sum(duplicates))) + + logMsg(paste('Unique license numbers in business licenses =', length(unique(licenses$LICENSE.NUMBER)))) + + # inspections not found in business licenses + inspectionsNotFound <- inspections[!(inspections$License %in% licenseNumbers) , c("License" , "DBA.Name", "Address", "Inspection.Date", "Results")] + inspectionsNotFound <- inspectionsNotFound[with(inspectionsNotFound, order(License, Inspection.Date)) , ] # sort + inspectionsNotFound <- inspectionsNotFound[!duplicated(inspectionsNotFound$License, fromLast=TRUE) , ] # keep only most recent + inspectionsNotFound <- inspectionsNotFound[inspectionsNotFound$Results != "Business Not Located", ] + inspectionsNotFound <- inspectionsNotFound[inspectionsNotFound$Results != "Out of 
Business", ] + logMsg(paste('License numbers in inspections not found in business licenses =', nrow(inspectionsNotFound))) + write.table(inspectionsNotFound, file=licensesNotFoundPath, sep=',', row.names=FALSE) + + # zip code discrepancies between the inspections and the business licenses + inspections <- inspections[complete.cases(inspections[ , c("Zip")]) , ] + logMsg(paste('Inspections with zip codes =', nrow(inspections))) + licenses <- licenses[complete.cases(licenses[ , c("ZIP.CODE")]) , ] + logMsg(paste('Business licenses with zip codes =', nrow(licenses))) + inspections <- merge(inspections, licenses, by.x="License", by.y="LICENSE.NUMBER") # merge() defaults to an inner join, so inspections without a matching license are dropped + logMsg(paste('Inspections matching business license on license number =', nrow(inspections))) + inspections <- inspections[(inspections$Zip != inspections$ZIP.CODE) , ] + logMsg(paste('Inspections with zip codes different from business license =', nrow(inspections))) + inspections <- inspections[ , c("License", "Inspection.Date", "Results", "DBA.Name", "Address", "Zip", "ZIP.CODE")] + names(inspections)[names(inspections)=="Zip"] <- "Food Inspection Zip" + names(inspections)[names(inspections)=="ZIP.CODE"] <- "Business License Zip" + inspections <- inspections[with(inspections, order(License, Inspection.Date)) , ] # sort + write.table(inspections, file=zipCodeDiscrepencyPath, sep=',', row.names=FALSE) + + logMsg('Done.') +} + +run <- function() { + logMsg('Reading food inspections') + fi <- readInspectionHistory() + logMsg(paste('Food inspections:', nrow(fi))) + logMsg('Reading licenses') + li <- readLicenses() + logMsg(paste('Licenses:', nrow(li))) + compareZips(fi, li) +} \ No newline at end of file diff --git a/R/foodInspections.R b/R/foodInspections.R new file mode 100644 index 0000000..06824ca --- /dev/null +++ b/R/foodInspections.R @@ -0,0 +1,186 @@ +# TITLE: foodInspections.R +# AUTHOR: Hugh J. Devlin, Ph. D. 
+# CREATED: 2014-04-08 + +options(warn=1) +options(error=utils::recover) +options(max.print=15000) +options(width=200) + +source("utilities.R") + +readInspections <- function(file="../data/Food_Inspections.csv") { + result <- read.csv(file, stringsAsFactors=FALSE, na.strings='') + names(result)[names(result)=="License.."] <- "License" # "License #" on the portal + result$Inspection.Date <- as.POSIXct(result$Inspection.Date, format="%m/%d/%Y") + result +} + +# last digit of year as plot character +yearPlotCharacter <- function(year) { + return(strtoi(charToRaw("0"), 16L) + as.integer(year) - 2010) +} + +plotYearByMonth <- function(year, df) { + yearName = as.character(year) + lines(1:12, df[yearName , ]) + points(1:12, df[yearName , ], pch=yearPlotCharacter(year)) +} + +addMonthAxisAndYearsLegend <- function(location='bottomright', years=2010:2013) { + axis(side=1, at=1:12, labels=month(1:12 , label=TRUE, abbr=TRUE)) + legend(location, legend=years, pch=yearPlotCharacter(years), title="Years") +} + +overPlotYearsByMonth <- function(...) { + plot(1:12, type="n", xlab="Month", xaxt="n", ...) 
+} + +# Matching inspections data +analyzeInspections <- function(inspections) { + RESULT_COLORS <- c("white", "red", "grey", "black", "green", "yellow") + + invalidZip <- (inspections$Zip %in% 60627) # %in% is never NA, so rows with a missing zip are kept intact + logMsg(paste('Removing invalid zip 60627; invalid rows =', sum(invalidZip, na.rm=TRUE))) + inspections <- inspections[!invalidZip , ] + + # decorate data frame + inspections$year <- year(inspections$Inspection.Date) + inspections$month <- month(inspections$Inspection.Date) + summarize(inspections) + + logMsg(paste('Unique license numbers in food inspections =', length(unique(inspections$License)))) + logMsg(paste('Unique zip codes in food inspections =', length(unique(inspections$Zip)))) + + logMsg('Chicago food inspections by zip code and result:') + inspectionsChicago <- inspections[which(60600 < inspections$Zip & inspections$Zip < 60700) , ] # which() skips rows whose zip is NA + inspectionsChicago$Zip <- factor(inspectionsChicago$Zip - 60600) + byZipAndResult <- table(inspectionsChicago$Zip, inspectionsChicago$Results) + print(byZipAndResult) + dev.new() + barplot(t(byZipAndResult), + main="Chicago food inspections by zip code and result (2010-present)", + xlab="Zip code 606xx", + ylab="Chicago food inspections", + legend.text=colnames(byZipAndResult), + args.legend=list(x="topright", title="Result"), + col=RESULT_COLORS + ) + dev.copy(svg, filename="../out/ChicagoFoodInspectionsByZipCodeAndResult.svg") + dev.off() + + logMsg('Food inspections by year and result:') + byYearAndResult <- table(inspections$Results, inspections$year) + print(byYearAndResult) + dev.new() + barplot(byYearAndResult, + main="Food inspections by year and result", + xlab="Year", + ylab="Food inspections", + legend.text=rownames(byYearAndResult), + args.legend=list(x="right", title="Result"), + col=RESULT_COLORS + ) + dev.copy(svg, filename="../out/FoodInspectionsByYearAndResult.svg") + dev.off() + + logMsg('Food inspections by year and month:') + byYearMonth <- table(inspections$year, inspections$month) + print(byYearMonth) + 
dev.new() + overPlotYearsByMonth( + main="Food inspections by year and month", + ylab="Food inspections", + ylim=range(byYearMonth) + ) + lapply(2010:2013, plotYearByMonth, byYearMonth) + addMonthAxisAndYearsLegend() + dev.copy(svg, filename="../out/FoodInspectionsByYearAndMonth.svg") + dev.off() + + logMsg('Food inspection failures by year and month:') + fails <- inspections[inspections$Results == 'Fail' , ] + failsByYearMonth <- table(fails$year, fails$month) + print(failsByYearMonth) + dev.new() + overPlotYearsByMonth( + main="Food inspection failures by year and month", + ylab="Food inspection failures", + ylim=range(failsByYearMonth) + ) + lapply(2010:2013, plotYearByMonth, failsByYearMonth) + addMonthAxisAndYearsLegend() + dev.copy(svg, filename="../out/FoodInspectionsFailuresByYearAndMonth.svg") + dev.off() + + logMsg('Food inspection failure rate by year and month:') + failureRateByYearMonth <- failsByYearMonth / byYearMonth + print(failureRateByYearMonth) + dev.new() + overPlotYearsByMonth( + main="Food inspection failure rate by year and month", + ylab="Food inspection failure rate", + ylim=c(0, 0.4) + ) + lapply(2010:2013, plotYearByMonth, failureRateByYearMonth) + addMonthAxisAndYearsLegend('topright') + dev.copy(svg, filename="../out/FoodInspectionsFailureRateByYearAndMonth.svg") + dev.off() + + logMsg('Chicago food inspection failures by zip code and year:') + failsChicago <- inspectionsChicago[inspectionsChicago$Results == 'Fail' , ] + failsChicagoByZipAndYear <- table(failsChicago$Zip, failsChicago$year) + print(failsChicagoByZipAndYear) + dev.new() + par(las=1) # horizontal axis labels + stripchart(values ~ ind, + data=stack(as.data.frame.matrix(failsChicagoByZipAndYear)), + pch=20, + main="Chicago food inspection failures by zip and year", + xlab="Year", + ylab="Food inspection failures", + vertical=TRUE, + col="blue" + ) + apply(failsChicagoByZipAndYear, 1, lines, col="blue") + dev.copy(svg, 
filename="../out/ChicagoFoodInspectionsFailuresByZipAndYear.svg") + dev.off() + + logMsg('Chicago food inspection failure counts correlation by zip code from year-to-year:') + print(cor(failsChicagoByZipAndYear)) + logMsg('Chicago food inspection failure counts by zip code correlation 2013 to 2014 with confidence interval and p-value:') + print(cor.test(failsChicagoByZipAndYear[ , "2013"], failsChicagoByZipAndYear[ , "2014"])) + + logMsg('Chicago food inspection failure rates by zip code and year:') + inspectionsChicagoByZipAndYear <- table(inspectionsChicago$Zip, inspectionsChicago$year) + failureRateChicagoByZipAndYear <- failsChicagoByZipAndYear / inspectionsChicagoByZipAndYear + print(failureRateChicagoByZipAndYear) + dev.new() + par(las=1) # horizontal axis labels + stripchart(values ~ ind, + data=stack(as.data.frame.matrix(failureRateChicagoByZipAndYear)), + pch=20, + main="Chicago food inspection failure rates by zip and year", + xlab="Year", + ylab="Food inspection failure rate", + vertical=TRUE, + col="blue" + ) + apply(failureRateChicagoByZipAndYear, 1, lines, col="blue") + dev.copy(svg, filename="../out/ChicagoFoodInspectionsFailureRatesByZipAndYear.svg") + dev.off() + + logMsg('Chicago food inspection failure rate correlation by zip code from year-to-year:') + print(cor(failureRateChicagoByZipAndYear)) + logMsg('Chicago food inspection failure rate by zip code correlation 2013 to 2014 with confidence interval and p-value:') + print(cor.test(failureRateChicagoByZipAndYear[ , "2013"], failureRateChicagoByZipAndYear[ , "2014"])) # cor.test() has no 'use' argument; it drops incomplete pairs itself + + logMsg('Done.') +} + +run <- function() { + logMsg('Reading food inspections') + fi <- readInspections() + logMsg(paste('Food inspections:', nrow(fi))) + analyzeInspections(fi) +} \ No newline at end of file diff --git a/R/foodInspectionsEvaluation.R b/R/foodInspectionsEvaluation.R new file mode 100644 index 0000000..3c2ea24 --- /dev/null +++ b/R/foodInspectionsEvaluation.R @@ -0,0 +1,165 @@ +# TITLE: 
foodInspectionsEvaluation.R +# AUTHOR: Tom Schenk Jr. and Hugh J. Devlin, Ph. D. +# CREATED: 2014-02-24 +# MODIFIED: 2014-02-27, 2014-04-01 +# NOTES: To execute, run 'Rscript foodInspectionsEvaluation.R' from terminal or command prompt. + +options(warn=1) +options(error=utils::recover) +options(max.print=2000) +options(width=150) + +library(stringr) + +source("utilities.R") +source("foodInspectionUtilities.R") + +readFoodInspections <- function(file="http://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv") { + result <- read.csv(file, stringsAsFactors=FALSE, na.strings='') + names(result)[names(result)=="License.."] <- "License" # rename column + result$Inspection.Date <- as.POSIXct(result$Inspection.Date, format="%m/%d/%Y") + result <- factorizeColumns(result, c("Facility.Type", "Inspection.Type", "City", "State", "Risk", "Results")) + result +} + +readPredictions <- function(file="../data/InspectionList.csv") { + result <- read.csv(file, stringsAsFactors=FALSE) + names(result)[names(result)=="license_number"] <- "License" # rename merge column + result <- factorizeColumns(result, c("risk", "type", "supervisor", "sanitarian")) + result +} + +zDifferenceOfProportions <- function(x1, x2, n1, n2) { + numerator <- (x1/n1) - (x2/n2) + p.common <- (x1 + x2) / (n1 + n2) # pooled proportion across both groups + denominator <- sqrt(p.common * (1 - p.common) * (1/n1 + 1/n2)) + numerator / denominator +} + +testPredictions <- function(critical, groups) { + # Create tables of outcomes (2x3 non-critical/critical, both/control/test) + critical.outcomes <- table(critical, groups) + logMsg("Critical outcomes for treatment and control group:") + print(critical.outcomes) + logMsg("Critical outcomes for treatment and control group (percentages)") + critical.outcomes.prop <- prop.table(critical.outcomes, 2) + print(critical.outcomes.prop) + + nBoth <- sum(critical.outcomes[ , "both"]) + nControl <- nBoth + sum(critical.outcomes[ , "control"]) + nTest <- nBoth + sum(critical.outcomes[ , "test"]) + + # The row name 
for critical violations is "TRUE" + xBoth <- critical.outcomes["TRUE", "both"] + xControl <- xBoth + critical.outcomes["TRUE", "control"] # "both" outcomes are counted in the control total and in the test total + xTest <- xBoth + critical.outcomes["TRUE", "test"] + + pControl <- xControl / nControl + pTest <- xTest / nTest + + logMsg(paste("Proportion criticals in test (test + both) group =", pTest)) + logMsg(paste("Proportion criticals in control (control + both) group =", pControl)) + logMsg(paste("difference in proportions =", (pTest - pControl))) + result <- zDifferenceOfProportions(xTest, xControl, nTest, nControl) + logMsg(paste("z score for test of proportions =", result)) + + contingencyTable <- matrix(c((nControl-xControl), (nTest-xTest), xControl, xTest), nrow=2, byrow=TRUE, + dimnames=list(c("Non-critical", "Critical"), c("Control", "Test"))) + logMsg('Contingency table:') + print(contingencyTable) + print(chisq.test(contingencyTable)) + print(prop.test(contingencyTable["Critical",], colSums(contingencyTable))) + print(fisher.test(contingencyTable)) + NULL +} + +# Compare model predictions with the first portal inspection per license on or after startDate +evaluateInspections <- function(inspections, predictions, startDate="2014-01-01") { + + # food inspections history from portal + logMsg(paste('Food inspections:', nrow(inspections))) + + # subset to inspections on or after startDate + inspections <- subset(inspections, Inspection.Date >= as.POSIXct(startDate, format="%Y-%m-%d")) + logMsg(paste('Food inspections after', startDate, ':', nrow(inspections))) + logMsg('Keep just the first of multiple inspections') + inspections <- inspections[with(inspections, order(License, Inspection.Date)) , ] # sort by license number and day + inspections <- inspections[!duplicated(inspections$License) , ] # remove 2nd and subsequent occurrences of a license number + logMsg(paste('Unique licenses in inspections', length(unique(inspections$License)))) + summarize(inspections) + + # predictions from model + logMsg(paste('Predictions', nrow(predictions))) + logMsg(paste('Unique licenses in predictions', length(unique(predictions$License)))) + 
logMsg('Predictions by group:') + print(table(predictions$type)) + logMsg('Predictions by zip code:') + print(sort(table(predictions$zip_code), decreasing=TRUE)) + summarize(predictions) + + # merge predictions and history + inspections <- merge(predictions, inspections) + logMsg(paste('Rows after merge of predicitons and inspections:', nrow(inspections))) + logMsg(paste('Unique licenses after merge', length(unique(inspections$License)))) + + # Find unmatched data that is in the predictions list but not in food inspection data + logMsg('Finding uninspected predictions') + predictionsNotInspected <- predictions[!(predictions$License %in% inspections$License) , ] + logMsg(paste("Predictions without inspections:", nrow(predictionsNotInspected))) + print(predictionsNotInspected[ , c("License", "zip_code", "type", "doing_business_as_name")]) + logMsg('Uninspected predictions by group:') + print(table(predictionsNotInspected$type)) + + # Remove non-inspection inspections from experiment + nonInspections <- inspections[inspections$Results %in% c("No Entry", "Out of Business", "Business Not Located") , ] # full column name; do not rely on partial matching of $Result + logMsg(paste('No inspection due to No Entry, Out of Business, or Business Not Located =', nrow(nonInspections))) + print(nonInspections[ , c("License", "Zip", "Inspection.Date", "type", "Results", "doing_business_as_name")]) + logMsg('Non-inspections by group') + print(table(nonInspections$type)) + logMsg('Removing non-inspections due to No Entry, Out of Business, or Business Not Located') + inspections <- inspections[!(inspections$License %in% nonInspections$License) , ] + summarize(inspections) + + logMsg('Differences in risk classification (rows are history data, columns are predictions data):') + print(table(inspections$Risk, inspections$risk)) + + zipCodeMismatches <- inspections[!is.na(inspections$Zip) & inspections$Zip!=inspections$zip_code , ] + logMsg(paste('Differences in zip code (DBA.Name, Zip And Address are from history data, doing_business_as_name, zip_code 
and address are from predictions data):', nrow(zipCodeMismatches))) + print(zipCodeMismatches[ , c("License", "DBA.Name", "Address", "Zip", "doing_business_as_name", "address", "zip_code", "type")]) + logMsg('Differences in zip code by group') + print(table(zipCodeMismatches$type)) + + logMsg('Inspections by type and group:') + byType <- table(inspections$Inspection.Type, inspections$type) + print(byType[byType[ , "both"] > 0 | byType[ , "control"] > 0 | byType[ , "test"] > 0 , ]) # only rows with a nonzero count in at least one of the three groups + + logMsg('Inspections by result and group:') + byResult <- table(inspections$Results, inspections$type) + print(byResult[byResult[ , "both"] > 0 | byResult[ , "control"] > 0 | byResult[ , "test"] > 0 , ]) + + # Parse and categorize inspections by criticality level + logMsg("Adding counts for critical, serious, and minor violations") + inspections <- countViolations(inspections) + logMsg("Finished adding counts for critical, serious, and minor violations") + + logMsg("Sanitarians by number of inspections, critical outcomes, and percent critical outcomes") + bySanitarian <- cbind(inspections=table(inspections$sanitarian), fails=table(inspections$sanitarian[inspections$critical])) + bySanitarian <- cbind(bySanitarian, failureRate=(bySanitarian[ , "fails"] / bySanitarian[ , "inspections"])) + bySanitarian <- merge(bySanitarian, aggregate(prediction ~ sanitarian, inspections, sum), by.x="row.names", by.y="sanitarian") + bySanitarian$prediction <- bySanitarian$prediction / bySanitarian$inspections # summed prediction / inspection count = mean prediction + names(bySanitarian)[names(bySanitarian)=="Row.names"] <- "Sanitarian" # rename column + names(bySanitarian)[names(bySanitarian)=="prediction"] <- "avgPrediction" # rename column + bySanitarian$conditionalFailureRate <- bySanitarian$failureRate * bySanitarian$avgPrediction + bySanitarian <- bySanitarian[order(-bySanitarian[ , "failureRate"]) , ] # sort descending failure rate + print(bySanitarian) + + testPredictions(inspections$critical, inspections$type) + + NULL +} + +run <- function() { + fi <- 
readFoodInspections()
  il <- readPredictions()
  evaluateInspections(fi, il)
}
# \ No newline at end of file
# diff --git a/R/foodInspectionsModel.R b/R/foodInspectionsModel.R
# new file mode 100644
# index 0000000..0086d7d
# --- /dev/null
# +++ b/R/foodInspectionsModel.R
# @@ -0,0 +1,88 @@
# Food Inspections prediction Model, Mark II
#
# Author: 368982 Hugh 4/22/14
###############################################################################

options(warn=1)
options(error=utils::recover)
options(max.print=15000)
options(width=300)

source("utilities.R")
source("foodInspectionUtilities.R")
source("readChicagoZipCodes.R")
source("readGarrisonLicenses.R")
source("readFoodInspections.R")

#' Build the food-inspection failure model end to end.
#'
#' Reads and cleans the ZIP code list, the Garrison licenses and the food
#' inspections, keeps inspections dated strictly between 2011-01-01 and
#' 2014-01-01, decorates licenses with per-license fail and violation counts,
#' fits a binomial GLM of `Fail` on those counts, and writes the cleaned
#' inspections and licenses to `../out/`.
#'
#' Side effects: prints and plots diagnostics, writes two CSV files, and copies
#' `ds`, `inspections` and `licenses` into the enclosing (global) environment
#' with `<<-` for interactive use.
#'
#' @return the value of the final `<<-` assignment (invisible); the function is
#'   called for its side effects.
run <- function() {

  logMsg("Reading Chicago ZIP codes")
  validZips <- readChicagoZipCodes("../data/ChicagoZipCodes.csv")
  logMsg(paste("Chicago ZIP codes:", nrow(validZips)))

  logMsg("Reading Garrison licenses")
  licenses <- readGarrisonLicenses("../data/GarrisonLicenses.csv")
  logMsg(paste("Garrison licenses records:", nrow(licenses)))

  logMsg("Cleaning Garrison licenses")
  licenses <- cleanGarrisonLicenses(licenses, validZips$Zip)
  logMsg(paste("Garrison licenses records:", nrow(licenses)))

  logMsg("Removing special cases from Garrison licenses")
  licenses <- specialCaseEliminations(licenses)
  logMsg(paste("Garrison licenses records:", nrow(licenses)))

  logMsg("Reading food inspection records")
  inspections <- readFoodInspections("../data/Food_Inspections.csv")
  logMsg(paste("Food inspection records:", nrow(inspections)))

  logMsg("Cleaning food inspection records")
  inspections <- cleanFoodInspections(inspections, validZips$Zip)
  logMsg(paste("Food inspection records:", nrow(inspections)))

  logMsg("Subset inspections by date")
  # both bounds are exclusive
  inspections <- inspections[as.POSIXct("2011-01-01") < inspections$Date & inspections$Date < as.POSIXct("2014-01-01") , ]
  logMsg(paste("Food inspection records:", nrow(inspections)))

  # Parse and categorize inspections by criticality level
  logMsg("Adding counts for critical, serious, and minor violations to inspections")
  inspections <- countViolations(inspections)
  logMsg("Finished adding counts for critical, serious, and minor violations to inspections")

  # Decorate licenses with fail counts and violation counts from inspections
  logMsg("Adding counts for critical, serious, and minor violations to licenses")
  # remove uninformative inspections
  # (column is spelled out in full: `$Result` only worked through partial matching)
  uninformative <- c("Business Not Located", "No Entry", "Out of Business")
  inspections <- inspections[!(inspections$Results %in% uninformative) , ]
  inspections$Fail <- (inspections$Results == "Fail")
#  failCount <- aggregate(Results ~ License, inspections, function(x) sum(x=="Fail"))
  failCount <- aggregate(Fail ~ License, inspections, sum)
  colnames(failCount)[colnames(failCount) == "Fail"] <- "failCount"
  # all.x=TRUE keeps licenses that have no inspections; their counts become NA
  licenses <- merge(licenses, failCount, all.x=TRUE)
  licenses <- merge(licenses, aggregate(criticalCount ~ License, inspections, sum), all.x=TRUE)
  licenses <- merge(licenses, aggregate(seriousCount ~ License, inspections, sum), all.x=TRUE)
  licenses <- merge(licenses, aggregate(minorCount ~ License, inspections, sum), all.x=TRUE)
  licenses$violationCount <- licenses$criticalCount + licenses$seriousCount + licenses$minorCount
  logMsg("Finished adding counts for critical, serious, and minor violations to licenses")
  countColumns <- c("failCount", "criticalCount", "seriousCount", "minorCount", "violationCount")
  print(cor(licenses[ , countColumns], use="pairwise.complete.obs"))
  plot(licenses$minorCount, licenses$violationCount)

  # Fit model
  ds <- inspections[, c("License", "Fail")]
  # Decorate inspections with licensee attributes
  ds <- merge(ds, licenses[ , c("License", countColumns)])
  summarize(ds)
  foodInspectionFailureModel <- glm(Fail ~ failCount + criticalCount + seriousCount + minorCount, ds, family=binomial)
  print(summary(foodInspectionFailureModel))
  print(summary(confint(foodInspectionFailureModel)))
  ds <<- ds # copy to global for interactive use
  plot(foodInspectionFailureModel)

  logMsg("Saving inspection data")
  write.table(inspections, file="../out/FoodInspections.csv", sep=",", row.names=FALSE)
  inspections <<- inspections # copy to global for interactive use

  logMsg("Saving license data")
  write.table(licenses, file="../out/GarrisonLicenses.csv", sep=",", row.names=FALSE)
  licenses <<- licenses # copy to global for interactive use

}
# \ No newline at end of file
# diff --git a/R/overPlotYearsByMonth.R b/R/overPlotYearsByMonth.R
# new file mode 100644
# index 0000000..d759f0f
# --- /dev/null
# +++ b/R/overPlotYearsByMonth.R
# @@ -0,0 +1,23 @@
# TITLE: overPlotYearsByMonth.R
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-17

#' Plot character for a year: the code point of the year's last digit.
#'
#' Computed as the code point of "0" (48) plus `year - 2010`, so it yields a
#' digit character only for years 2010 through 2019.
#'
#' @param year a year or vector of years (coerced with `as.integer`)
#' @return numeric vector of `pch` values, one per year
yearPlotCharacter <- function(year) {
  utf8ToInt("0") + as.integer(year) - 2010
}

#' Add one year's monthly series (line plus year-digit points) to the open plot.
#'
#' @param year the year to draw; `as.character(year)` must be a row name of `df`
#' @param df a table or matrix indexed by year name (rows) with 12 month columns
plotYearByMonth <- function(year, df) {
  yearName <- as.character(year)
  lines(1:12, df[yearName , ])
  points(1:12, df[yearName , ], pch=yearPlotCharacter(year))
}

#' Add the abbreviated-month x axis and a legend of year plot characters.
#'
#' @param location legend position keyword passed to `legend`
#' @param years the years shown in the legend
addMonthAxisAndYearsLegend <- function(location='bottomright', years) {
  axis(side=1, at=1:12, labels=month(1:12 , label=TRUE, abbr=TRUE))
  legend(location, legend=years, pch=yearPlotCharacter(years), title="Years")
}

#' Open an empty month-by-value plot for the per-year series to be drawn onto.
#'
#' @param ... further arguments forwarded to `plot` (e.g. `main`, `ylab`, `ylim`)
overPlotYearsByMonth <- function(...) {
  plot(1:12, type="n", xlab="Month", xaxt="n", ...)
}
# diff --git a/R/readBusinessLicenses.R b/R/readBusinessLicenses.R
# new file mode 100644
# index 0000000..a8d57d6
# --- /dev/null
# +++ b/R/readBusinessLicenses.R
# @@ -0,0 +1,19 @@
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-07

source("utilities.R")

#' Read a business-licenses CSV export.
#'
#' Empty fields are read as NA and strings stay character. The eight date
#' columns listed below are then handed to `posixifyColumns` (from
#' utilities.R) for conversion.
#'
#' @param file path to the CSV file
#' @return the data frame returned by `posixifyColumns`
readLicenses <- function(file) {
  dateColumns <- c(
    "APPLICATION.CREATED.DATE",
    "APPLICATION.REQUIREMENTS.COMPLETE",
    "PAYMENT.DATE",
    "LICENSE.TERM.START.DATE",
    "LICENSE.TERM.EXPIRATION.DATE",
    "LICENSE.APPROVED.FOR.ISSUANCE",
    "DATE.ISSUED",
    "LICENSE.STATUS.CHANGE.DATE"
  )
  licenses <- read.csv(file, stringsAsFactors = FALSE, na.strings = "")
  posixifyColumns(licenses, dateColumns)
}
# diff --git a/R/readChicagoZipCodes.R b/R/readChicagoZipCodes.R
# new file mode 100644
# index 0000000..2851519
# --- /dev/null
# +++ b/R/readChicagoZipCodes.R
# @@ -0,0 +1,7 @@
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-24

#' Read the list of Chicago ZIP codes.
#'
#' @param file path to the CSV file; empty fields are read as NA and strings
#'   are kept as character
#' @return a data frame with the file's contents
readChicagoZipCodes <- function(file="../data/ChicagoZipCodes.csv") {
  read.csv(file, stringsAsFactors = FALSE, na.strings = "")
}
# diff --git a/R/readFoodInspections.R b/R/readFoodInspections.R
# new file mode 100644
# index 0000000..ac39040
# --- /dev/null
# +++ b/R/readFoodInspections.R
# @@ -0,0 +1,93 @@
# TITLE: foodInspectionsEvaluation.R
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-17

options(warn=1)

source("utilities.R")

#' Read the food inspections CSV and normalise its column names.
#'
#' Empty fields become NA, strings stay character, the long export column
#' names are shortened, and `Date` is parsed from month/day/year text.
#'
#' @param file path to the food inspections CSV
#' @return a data frame with renamed columns and a POSIXct `Date`
readFoodInspections <- function(file) {
  # export column name -> short name used by the rest of the code
  columnRenames <- c(
    "Inspection.ID"   = "ID",
    "License.."       = "License",
    "Facility.Type"   = "FacilityType",
    "DBA.Name"        = "DBA",
    "AKA.Name"        = "AKA",
    "Inspection.Date" = "Date",
    "Inspection.Type" = "Type"
  )
  inspections <- read.csv(file, stringsAsFactors = FALSE, na.strings = "")
  toRename <- names(inspections) %in% names(columnRenames)
  names(inspections)[toRename] <- unname(columnRenames[names(inspections)[toRename]])

  inspections$Date <- as.POSIXct(inspections$Date, format = "%m/%d/%Y")
  inspections
}

#' Clean the food inspection records, logging and printing each step.
#'
#' Drops rows with a missing or zero license number, normalises the city name
#' and drops rows outside Chicago, imputes missing state to IL, blanks ZIP
#' codes that are not in `validZips`, removes the `City` and `State` columns
#' and factorizes the categorical columns.
#'
#' @param df data frame as returned by `readFoodInspections`
#' @param validZips vector of acceptable ZIP codes
#' @param missingRiskFile not used by this function
#' @return the cleaned data frame
cleanFoodInspections <- function(df, validZips, missingRiskFile="../out/MissingRisk.csv") {
  # columns shown when a step prints the rows it affects
  shownCols <- c("License", "DBA", "Address", "City", "State", "Zip", "FacilityType", "Risk", "Type", "Date", "Results")
  shownColsNoCity <- setdiff(shownCols, "City")
  shownColsNoCityState <- setdiff(shownColsNoCity, "State")
  # known misspellings of CHICAGO
  chicagoMisspellings <- c(
    "CHICAGOCHICAGO",
    "CHICAGOH",
    "CHICAGOI",
    "CCHICAGO",
    "CHCICAGO",
    "CHCHICAGO"
  )

  insp <- df

  # License
  noLicense <- is.na(insp$License)
  logMsg(paste("Removing facilities with missing license number:", sum(noLicense)))
  print(insp[noLicense, shownCols], row.names = FALSE)
  insp <- insp[!noLicense, ]
  logMsg(paste("Rows:", nrow(insp)))

  zeroLicense <- insp$License == 0
  logMsg(paste("Removing facilities with license number zero:", sum(zeroLicense)))
  print(insp[zeroLicense, shownCols], row.names = FALSE)
  insp <- insp[!zeroLicense, ]
  logMsg(paste("Rows:", nrow(insp)))

  # City
  logMsg("Upper casing city name")
  insp$City <- toupper(insp$City)
  misspelled <- insp$City %in% chicagoMisspellings
  logMsg(paste("Fixing spelling errors in city name:", sum(misspelled)))
  print(insp[misspelled, shownCols], row.names = FALSE)
  insp$City[misspelled] <- "CHICAGO"

  noCity <- is.na(insp$City)
  logMsg(paste("Imputing missing city to CHICAGO:", sum(noCity)))
  print(insp[noCity, shownCols], row.names = FALSE)
  insp$City[noCity] <- "CHICAGO"

  outsideChicago <- insp$City != "CHICAGO"
  logMsg(paste("Removing facilities outside Chicago:", sum(outsideChicago)))
  print(insp[outsideChicago, shownCols], row.names = FALSE)
  insp <- insp[!outsideChicago, ]
  insp$City <- NULL # every remaining row is CHICAGO
  logMsg(paste("Rows:", nrow(insp)))

  # State
  noState <- is.na(insp$State)
  logMsg(paste("Imputing missing state to IL:", sum(noState)))
  print(insp[noState, shownColsNoCity], row.names = FALSE)
  insp$State[noState] <- "IL"

  # out-of-state rows are only counted, not removed
  logMsg(paste("Facilities outside Illinois:", sum(insp$State != "IL")))
  insp$State <- NULL

  # Zip
  noZip <- is.na(insp$Zip)
  logMsg(paste("Facilities missing zip:", sum(noZip)))
  print(insp[noZip, shownColsNoCityState], row.names = FALSE)

  foreignZip <- !is.na(insp$Zip) & !(insp$Zip %in% validZips)
  logMsg(paste("Setting zip codes of facilities with zip outside Chicago to NA:", sum(foreignZip)))
  print(insp[foreignZip, shownColsNoCityState], row.names = FALSE)
  insp$Zip[foreignZip] <- NA

  insp <- factorizeColumns(insp, c("FacilityType", "Type", "Risk", "Results"))

  logMsg(paste("Rows:", nrow(insp)))
  logMsg(paste("Unique license numbers:", length(unique(insp$License))))

  insp
}
# \ No newline at end of file
# diff --git a/R/readFoodInspectionsHistory.R b/R/readFoodInspectionsHistory.R
# new file mode 100644
# index 0000000..6ddefec
# --- /dev/null
# +++ b/R/readFoodInspectionsHistory.R
# @@ -0,0 +1,15 @@
# TITLE: readFoodInspectionsHistory.R
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-17

options(warn=1)

source("utilities.R")

#' Read the food inspections history CSV.
#'
#' Renames `License..` to `License`, parses `Inspection.Date` from
#' month/day/year text, and factorizes the categorical columns.
#'
#' @param file path to the CSV file
#' @return the data frame returned by `factorizeColumns`
readFoodInspectionsHistory <- function(file) {
  categoricalCols <- c("Facility.Type", "Inspection.Type", "City", "State", "Risk", "Results")
  history <- read.csv(file, stringsAsFactors = FALSE, na.strings = "")
  names(history)[names(history) == "License.."] <- "License"
  history$Inspection.Date <- as.POSIXct(history$Inspection.Date, format = "%m/%d/%Y")
  factorizeColumns(history, categoricalCols)
}
# diff --git a/R/readGarrison.R b/R/readGarrison.R
# new file mode 100644
# index 0000000..6c200b1
# --- /dev/null
# +++ b/R/readGarrison.R
# @@ -0,0 +1,56 @@
# TITLE: readGarrison.R
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-17

#' Read the Garrison inspections export.
#'
#' Drops `License.Type`, shortens the export column names and parses `Date`
#' from month/day/year text. Empty fields are read as NA.
#'
#' @param file path to the CSV export
#' @return a data frame of inspections
readGarrison <- function(file="../data/InspectionsGarrisonExport20112014.csv") {
  result <- read.csv(file, stringsAsFactors=FALSE, na.strings='')
  result$License.Type <- NULL # always Food
  names(result)[names(result)=="License.Number"] <- "License"
  names(result)[names(result)=="Inspector.Assigned"] <- "Sanitarian"
  names(result)[names(result)=="Inspection.Date"] <- "Date"
  names(result)[names(result)=="Inspection.Purpose"] <- "Purpose"
  result$Date <- as.POSIXct(result$Date, format="%m/%d/%Y")
  result
}

#' Clean Garrison inspections by removing unusable rows.
#'
#' Removes rows with a missing, zero or "0-0" license number, a blank
#' inspector, a missing address, no ZIP at the end of the address, the ZIP
#' 60627, or any remaining NA. The ZIP is taken from the last
#' space-separated token of `Address` (the part before any "-").
#'
#' @param df data frame as returned by `readGarrison`
#' @return the cleaned data frame with an integer `Zip` column
cleanGarrison <- function(df) {
  result <- df

  licenseMissing <- is.na(result$License)
  logMsg(paste("Removing inspections with license number missing:", sum(licenseMissing)))
  result <- result[!licenseMissing , ]

  licenseZero <- (result$License == 0)
  logMsg(paste("Removing inspections with license number zero:", sum(licenseZero)))
  result <- result[!licenseZero , ]

  licenseZeroDashZero <- (result$License == "0-0")
  logMsg(paste("Removing inspections with license number 0-0:", sum(licenseZeroDashZero)))
  result <- result[!licenseZeroDashZero , ]

  sanitarianBlank <- (result$Sanitarian == " ")
  logMsg(paste("Removing inspections with blank inspector:", sum(sanitarianBlank)))
  result <- result[!sanitarianBlank , ]

  addressMissing <- is.na(result$Address)
  logMsg(paste("Removing inspections with address missing:", sum(addressMissing)))
  result <- result[!addressMissing , ]

  # last token of the address is the zip; an address ending in the state has none
  result$Zip <- sapply(strsplit(result$Address, split=" ", fixed=TRUE), tail, n=1)
  result$Zip <- sapply(strsplit(result$Zip, split="-", fixed=TRUE), head, n=1) # for zip+4
  zipMissing <- (result$Zip == 'IL')
  logMsg(paste("Removing inspections with zip missing:", sum(zipMissing)))
  result <- result[!zipMissing , ]
  result$Zip <- as.integer(result$Zip)

  invalidZip <- (result$Zip == 60627)
  logMsg(paste('Removing inspections with invalid zip 60627:', sum(invalidZip, na.rm=TRUE)))
  result <- result[!invalidZip , ]

  incompleteCases <- !complete.cases(result)
  logMsg(paste("Removing inspections with incomplete cases:", sum(incompleteCases)))
  result <- result[!incompleteCases , ]

  result <- factorizeColumns(result, c("Sanitarian", "Purpose"))
  result
}
# diff --git a/R/readGarrisonInspections.R b/R/readGarrisonInspections.R
# new file mode 100644
# index 0000000..6c1f299
# --- /dev/null
# +++ b/R/readGarrisonInspections.R
# @@ -0,0 +1,67 @@
# TITLE: readGarrisonInspections.R
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-23

#' Read a Garrison inspections export (same layout as `readGarrison`).
#'
#' @param file path to the CSV export
#' @return a data frame of inspections with shortened column names and a
#'   POSIXct `Date`
readGarrisonInspections <- function(file) {
  result <- read.csv(file, stringsAsFactors=FALSE, na.strings='')
  result$License.Type <- NULL # always Food
  names(result)[names(result)=="License.Number"] <- "License"
  names(result)[names(result)=="Inspector.Assigned"] <- "Sanitarian"
  names(result)[names(result)=="Inspection.Date"] <- "Date"
  names(result)[names(result)=="Inspection.Purpose"] <- "Purpose"
  result$Date <- as.POSIXct(result$Date, format="%m/%d/%Y")
  result
}

#' Clean Garrison inspections, reporting rather than removing most problems.
#'
#' Only rows with a missing, zero or "0-0" license number are removed. Blank
#' inspectors, missing addresses, missing or invalid ZIPs and incomplete rows
#' are logged and printed but kept (the removals are commented out).
#'
#' @param df data frame as returned by `readGarrisonInspections`
#' @param validZips vector of acceptable ZIP codes
#' @return the cleaned data frame with an integer `Zip` column
cleanGarrisonInspections <- function(df, validZips) {
  result <- df

  # License
  licenseMissing <- is.na(result$License)
  logMsg(paste("Removing inspections with license number missing:", sum(licenseMissing)))
  result <- result[!licenseMissing , ]

  licenseZero <- (result$License == 0)
  logMsg(paste("Removing inspections with license number zero:", sum(licenseZero)))
  result <- result[!licenseZero , ]

  licenseZeroDashZero <- (result$License == "0-0")
  logMsg(paste("Removing inspections with license number zero dash zero:", sum(licenseZeroDashZero)))
  result <- result[!licenseZeroDashZero , ]

  sanitarianBlank <- (result$Sanitarian == " ")
  logMsg(paste("Inspections with blank inspector:", sum(sanitarianBlank)))
  print(result[sanitarianBlank , ], row.names=FALSE)
#  result <- result[!sanitarianBlank , ]

  addressMissing <- is.na(result$Address)
  logMsg(paste("Inspections with address missing:", sum(addressMissing)))
  print(result[addressMissing , ], row.names=FALSE)
#  result <- result[!addressMissing , ]

  result$Zip <- sapply(strsplit(result$Address, split=" ", fixed=TRUE), tail, n=1)
  result$Zip <- sapply(strsplit(result$Zip, split="-", fixed=TRUE), head, n=1) # for zip+4
  result$Zip[result$Zip == 'IL'] <- NA
  result$Zip <- as.integer(result$Zip)
  zipMissing <- is.na(result$Zip)
  logMsg(paste("Inspections with zip missing:", sum(zipMissing)))
  print(result[zipMissing , ], row.names=FALSE)
#  result <- result[!zipMissing , ]

  invalidZip <- !is.na(result$Zip) & !(result$Zip %in% validZips)
  logMsg(paste('Inspections with invalid zip:', sum(invalidZip, na.rm=TRUE)))
  print(result[invalidZip , ], row.names=FALSE)
#  result <- result[!invalidZip , ]

  incompleteCases <- !complete.cases(result)
  logMsg(paste("Inspections with incomplete cases:", sum(incompleteCases)))
  print(result[incompleteCases , ], row.names=FALSE)
#  result <- result[!incompleteCases , ]

  result <- factorizeColumns(result, c("Sanitarian", "Purpose"))

  logMsg(paste('Rows:', nrow(result)))
  logMsg(paste('Unique license numbers:', length(unique(result$License))))

  result
}
# diff --git a/R/readGarrisonLicenses.R b/R/readGarrisonLicenses.R
# new file mode 100644
# index 0000000..bc76a27
# --- /dev/null
# +++ b/R/readGarrisonLicenses.R
# @@ -0,0 +1,208 @@
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-07

options(warn=1)

library(stringr)

source("utilities.R")

#' Read the Garrison licenses export with a hand-rolled parser.
#'
#' The file is split on the literal quote-comma-quote separator instead of
#' being read as CSV, and must yield exactly 12 fields per line. The first
#' line supplies the column names, which are then shortened. Blanks are
#' converted by `naColumns` and `LastInspectionDate` by `posixifyColumns`
#' (both from utilities.R).
#'
#' @param file path to the export
#' @return a data frame of licenses; fields are character apart from
#'   whatever `naColumns` / `posixifyColumns` convert
readGarrisonLicenses <- function(file) {

  # Garrison export has un-escaped double-quotes within double-quotes and commas within double-quotes, so...
  result <- readLines(file)
  # NOTE(review): the last line is dropped unconditionally; this assumes the
  # export always ends with a blank line - confirm against a real export
  result <- result[-length(result)] # remove last (blank) line
  result <- str_trim(result) # remove leading and trailing white space from lines
  result <- substr(result, 2, nchar(result) - 1) # remove leading and trailing quote
  result <- strsplit(result, "\",\"", fixed = TRUE) # split on quote-comma-quote
  result <- lapply(result, str_trim) # remove leading and trailing white space from values
  result <- matrix(unlist(result), ncol=12, byrow=TRUE)

  names <- result[1 , ]
  result <- data.frame(result[-1 , ], row.names=NULL, stringsAsFactors=FALSE)
  names(result) <- names
  names(result)[names(result)=="License Number"] <- "License"
  names(result)[names(result)=="DBA Name"] <- "DBA"
  names(result)[names(result)=="AKA Name"] <- "AKA"
  names(result)[names(result)=="Facility Address"] <- "Address"
  names(result)[names(result)=="Facility City"] <- "City"
  names(result)[names(result)=="Facility State"] <- "State"
  names(result)[names(result)=="Facility Zip"] <- "Zip"
  names(result)[names(result)=="Risk Category"] <- "Risk"
#  names(result)[names(result)=="Status"] <- "Status"
  names(result)[names(result)=="License Code"] <- "Code"
  names(result)[names(result)=="Assigned Sanitarian"] <- "Sanitarian"
  names(result)[names(result)=="Last Inspection Date"] <- "LastInspectionDate"

  result <- naColumns(result) # blanks to na
  result <- posixifyColumns(result, "LastInspectionDate")

  result
}

#' Clean the Garrison license records, logging and printing each step.
#'
#' Steps, in order:
#' * remove rows with a missing, zero or "0-0" license number or a missing
#'   address;
#' * normalise the city, remove rows outside Chicago, drop `City`;
#' * impute missing state to IL, count out-of-state rows, drop `State`;
#' * set ZIPs not in `validZips` to NA;
#' * write the rows with missing risk to `missingRiskFile`;
#' * turn risk "All" into NA and recode `Risk` as the integer code of its
#'   factor level;
#' * report active licenses with more than one risk category;
#' * keep only the most recently inspected row per license number.
#'
#' @param df data frame as returned by `readGarrisonLicenses`
#' @param validZips vector of acceptable ZIP codes
#' @param missingRiskFile path of the CSV that receives rows with missing risk
#' @return the cleaned data frame, one row per license number
cleanGarrisonLicenses <- function(df, validZips, missingRiskFile="../out/MissingRisk.csv") {
  result <- df
  logMsg(paste('Rows:', nrow(result)))

  # License
  licenseNa <- is.na(result$License)
  logMsg(paste("Removing facilities with missing license number:", sum(licenseNa)))
  print(result[licenseNa , c("License", "DBA", "Address", "City", "State", "Zip", "Status", "Risk", "Code", "LastInspectionDate")], row.names=FALSE)
  result <- result[!licenseNa , ] # subset by license
  logMsg(paste('Rows:', nrow(result)))

  licenseZero <- (result$License == 0)
  logMsg(paste("Removing facilities with license number zero:", sum(licenseZero)))
  print(result[licenseZero , c("License", "DBA", "Address", "City", "State", "Zip", "Status", "Risk", "LastInspectionDate")], row.names=FALSE)
  result <- result[!licenseZero , ] # subset by license
  logMsg(paste('Rows:', nrow(result)))

  licenseZeroDashZero <- (result$License == "0-0")
  logMsg(paste("Removing facilities with license number zero dash zero:", sum(licenseZeroDashZero)))
  print(result[licenseZeroDashZero , c("License", "DBA", "Address", "City", "State", "Zip", "Status", "Risk", "Code", "LastInspectionDate")], row.names=FALSE)
  result <- result[!licenseZeroDashZero , ] # subset by license
  logMsg(paste('Rows:', nrow(result)))

  # Address
  addressNa <- is.na(result$Address)
  logMsg(paste("Removing facilities with missing address:", sum(addressNa)))
  print(result[addressNa , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code", "LastInspectionDate")], row.names=FALSE)
  result <- result[!addressNa , ] # subset by address
  logMsg(paste('Rows:', nrow(result)))

  # City
  logMsg("Upper casing city name")
  result$City <- toupper(result$City)
  # Known misspellings of Chicago
  chicagoSpellingError <- result$City %in% c(
    "CHICAGOCHICAGO",
    "CHICAGOH",
    "CHICAGOI",
    "CCHICAGO",
    "CHCICAGO",
    "CHCHICAGO"
  )
  logMsg(paste("Fixing spelling errors in city name:", sum(chicagoSpellingError)))
  print(result[chicagoSpellingError , c("License", "DBA", "Address", "City", "State", "Zip", "Status", "Risk")], row.names=FALSE)
  result$City[chicagoSpellingError] <- "CHICAGO"

  cityNa <- is.na(result$City)
  logMsg(paste("Imputing missing city to CHICAGO:", sum(cityNa)))
  print(result[cityNa , c("License", "DBA", "Address", "City", "State", "Zip", "Status", "Risk")], row.names=FALSE)
  result$City[cityNa] <- "CHICAGO"

  notChicago <- (result$City != "CHICAGO")
  logMsg(paste("Removing facilities outside Chicago:", sum(notChicago)))
  print(result[notChicago , c("License", "DBA", "Address", "City", "State", "Zip", "Status", "Risk", "LastInspectionDate")], row.names=FALSE)
  result <- result[!notChicago , ] # subset by city
  result$City <- NULL # done with city
  logMsg(paste('Rows:', nrow(result)))

  # State
  stateNa <- is.na(result$State)
  logMsg(paste("Imputing missing state to IL:", sum(stateNa)))
  print(result[stateNa , c("License", "DBA", "Address", "State", "Zip", "Status", "Risk", "LastInspectionDate")], row.names=FALSE)
  result$State[stateNa] <- "IL"

  # out-of-state rows are only counted, not removed
  notIllinois <- (result$State != "IL")
  logMsg(paste("Facilities outside Illinois:", sum(notIllinois)))
  result$State <- NULL # done with state

  # Zip
  result$Zip[result$Zip == "60618 6136"] <- "60618" # trim one zip+4

  zipNa <- is.na(result$Zip)
  logMsg(paste("Facilities missing zip:", sum(zipNa)))
  print(result[zipNa , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code")], row.names=FALSE)

  zipNonChicago <- !is.na(result$Zip) & !(result$Zip %in% validZips)
  logMsg(paste("Setting zip codes of facilities with zip outside Chicago to NA:", sum(zipNonChicago)))
  print(result[zipNonChicago , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code")], row.names=FALSE)
  result$Zip[zipNonChicago] <- NA

  # Risk
  riskNa <- is.na(result$Risk)
  logMsg(paste("Facilities missing risk:", sum(riskNa)))
#  print(result[riskNa , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code")], row.names=FALSE)
  write.table(result[riskNa , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code")], file=missingRiskFile, sep=',', row.names=FALSE)

  riskAll <- (!riskNa & result$Risk == "All")
  logMsg(paste('Setting risk of facilities with a risk of "All" (unclassified) to NA:', sum(riskAll)))
  print(result[riskAll , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code")], row.names=FALSE)
  result$Risk[riskAll] <- NA
  # integer code of each risk label, in sorted label order
  result$Risk <- as.integer(factor(result$Risk))

  ambiguousRiskLicenses <- ambiguousRiskActiveLicenses(result)
  logMsg(paste('Active licenses with an ambiguous risk:', length(ambiguousRiskLicenses)))
  ambiguousRiskLicenses <- result[result$License %in% ambiguousRiskLicenses , ]
  print(ambiguousRiskLicenses[order(ambiguousRiskLicenses$License, ambiguousRiskLicenses$LastInspectionDate), c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code", "LastInspectionDate")], row.names=FALSE)

  # Status
  statusNa <- is.na(result$Status)
  logMsg(paste('Facilities missing status:', sum(statusNa)))
  print(result[statusNa , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code", "LastInspectionDate")], row.names=FALSE)

  # License Code
  codeNa <- is.na(result$Code)
  logMsg(paste('Facilities missing license code:', sum(codeNa)))
  print(result[codeNa , c("License", "DBA", "Address", "Zip", "Status", "Risk", "Code", "LastInspectionDate")], row.names=FALSE)

  logMsg('Sorting by license number and date of last inspection (ascending)')
  result <- result[order(result$License, result$LastInspectionDate) , ]
  # fromLast=TRUE flags every row of a license except its last (most recent) one
  dups <- duplicated(result$License, fromLast=TRUE)
  # report how many rows are dropped (sum), not the length of the mask
  logMsg(paste('Removing all but most recent of duplicate license numbers:', sum(dups)))
  print(result[dups , c("License", "DBA", "Address", "Status", "Risk", "Code", "LastInspectionDate")], row.names=FALSE)
  result <- result[!dups , ]

  result <- factorizeColumns(result, c("Status", "Sanitarian"))

  logMsg(paste('Rows:', nrow(result)))
  logMsg(paste('Unique license numbers:', length(unique(result$License))))

  result
}

#' @return a list of active license numbers of facilities with more than one risk category
#' @author Hugh J. Devlin, Ph. D.
#' \email{Hugh.Devlin@@cityofchicago.org}
#'
#' Only rows whose `Status` is exactly "Active" are considered; rows with a
#' missing status are ignored. Works for any number of risk categories.
#'
#' @param df data frame with `License`, `Status` and `Risk` columns
ambiguousRiskActiveLicenses <- function(df) {
  # %in% treats NA status as "not active" instead of yielding all-NA rows
  active <- df[df$Status %in% "Active", , drop = FALSE]
  # license x risk contingency table; NA risks are not tabulated
  counts <- table(active$License, active$Risk)
  if (length(counts) == 0L) {
    return(character(0))
  }
  # a license is ambiguous when it appears under more than one risk category
  rownames(counts)[rowSums(counts > 0) > 1]
}

#' Remove licenses located at large sports venues.
#'
#' Rows whose `Address` exactly matches one of the listed venue addresses are
#' logged, printed and dropped, one venue at a time.
#'
#' @param df license data frame with at least `License`, `DBA`, `Address`,
#'   `Zip`, `Status` and `Code` columns
#' @return `df` without the venue rows
specialCaseEliminations <- function(df) {
  # venue label (as it appears in the log) -> exact address spellings
  venues <- list(
    "United Center" = c("1901 W MADISON ST", "1901 W MADISON"),
    "Wrigley Field" = c("1060 W ADDISON ST"),
    "Cellular Field" = c("333 W 35TH ST")
  )
  shownCols <- c("License", "DBA", "Address", "Zip", "Status", "Code")

  result <- df
  logMsg(paste('Rows:', nrow(result)))

  # a plain loop keeps the logMsg calls in this function's own frame
  # (presumably the logger stamps messages with the caller - confirm in utilities.R)
  for (venue in names(venues)) {
    atVenue <- result$Address %in% venues[[venue]]
    logMsg(paste(paste0("Removing ", venue, ":"), sum(atVenue)))
    print(result[atVenue , shownCols], row.names=FALSE)
    result <- result[!atVenue , ]
    logMsg(paste('Rows:', nrow(result)))
  }

  result
}
# \ No newline at end of file
# diff --git a/R/runCheckZips.R b/R/runCheckZips.R
# new file mode 100644
# index 0000000..e67eb82
# --- /dev/null
# +++ b/R/runCheckZips.R
# @@ -0,0 +1,8 @@
# TODO: Add comment
#
# Author: 368982
###############################################################################

source("foodInspectionZipCodes.R")

+run() \ No newline at end of file diff --git a/R/tests/runAllTests.R b/R/tests/runAllTests.R new file mode 100644 index 0000000..51f5b58 --- /dev/null +++ b/R/tests/runAllTests.R @@ -0,0 +1,19 @@ +# Run RUnit tests +# +# Author: Hugh 2013-07-15 +############################################################################### + +options(warn=1) + +library('RUnit') + +test.suite <- defineTestSuite( + "all tests", + dirs = file.path("tests"), + testFileRegexp = '^test.*\\.R' +) + +runAllTests <- function() { + test.result <- runTestSuite(test.suite) + printTextProtocol(test.result) +} diff --git a/R/tests/testAnonymize.R b/R/tests/testAnonymize.R new file mode 100644 index 0000000..f895e29 --- /dev/null +++ b/R/tests/testAnonymize.R @@ -0,0 +1,16 @@ +# TODO: Add comment +# +# Author: 368982 +############################################################################### + +library('RUnit') + +source("utilities.R") + +df <- data.frame(name=c("foo", "bar", "foo", "bas"), attribute=c(1, 2, 3, 4)) + +test.anonymizeColumns <- function() { + actual <- anonymizeColumns(df, c("name")) + checkEquals(actual$name, c(3, 1, 3, 2), "name identifiers") + checkEquals(actual$attribute, c(1, 2, 3, 4), "attributes the same") +} diff --git a/R/tests/testViolations.R b/R/tests/testViolations.R new file mode 100644 index 0000000..901eddf --- /dev/null +++ b/R/tests/testViolations.R @@ -0,0 +1,26 @@ +# TODO: Add comment +# +# Author: 368982 +############################################################################### + +library('RUnit') + +source("foodInspectionUtilities.R") + +violations <- c( + "29. 
PREVIOUS MINOR VIOLATION(S) CORRECTED 7-42-090 - Comments: PREVIOS MINOR VIOLATIONS, FROM REPORT #1154846 DATED 01-31-2013, NOT CORRECTED: (#35)SEAL OPENINGS ALONG BASEBOARDS IN ALL AREAS,PAINT RAW WOOD BASEBOARDS IN PREP AREA,REPLACE STAINED CEILING TILES IN ALL AREAS,REPLACE MISSING OUTLET COVERS IN ALL AREAS(#40)PROVIDE THERMOMETERS IN ALL COOLERS AND METAL STEM THERMOMETERS FOR INTERNAL TEMPERATURES. INSTRUCTED TO CORRECT VIOLATIONS. SERIOUS VIOLATION 7-42-090. | 36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES OF LIGHT PROVIDED, FIXTURES SHIELDED - Comments: LIGHTS INSIDE FRONT COOKING AREA HOOD ARE NOT SHIELDED. MUST INSTALL LIGHT SHIELDS OR SHATTER RESISTANT BULBS. | 13. NO EVIDENCE OF RODENT OR INSECT INFESTATION, NO BIRDS, TURTLES OR OTHER ANIMALS - Comments: EVIDENCE OF RODENT INFESTATION. OBSERVED OVER 50 MICE DROPPINGS ON FLOOR ALONG WALL BASE AND CORNERS BEHIND A SHELVING UNIT NEXT TO THE WASHROON IN REAR OF PREMISES, OVER 20 MICE DROPPINGS ON BOTTOM SHELF OF PREP TABLE IN THE KITCHEN AREA NEAR THE MEAT SLICER, POTS AND PANS ARE STORED, 0VER 20 MICE DROPPINGS ON FLOOR ALONG WALL BASE BEHIND A TABLE IN THE STORAGE AREA, OVER 30 MICE DROPPINGS ON FLOOR ALONG WALL BASE UNDERNEATH THE RADIATOR AND SHELVING UNIT IN KITCHEN AREA, OVER 25 MICE DROPPINGS UNDERNEATH BOTTOM OF TABLE WHERE THE COFFEE MACHINE IS STORED, 10 MICE DROPPINGS ON A GLUE TRAP BEHIND THE CHEST FREEZER IN THE STORAGE AREA, 0VER 20 MICE DROPPINGS UNDERNEATH A SHELVING UNIT IN THE STORAGE CLOSET IN REAR NEXT TO THE WASHROOM, OVER 15 MICE DROPPINGS ON FLOOR BEHIND TABLE WHERE TROPHYS AREA STORED. OUTER OPENING NOT PROTECTED. INSTALL MESH SCREEN AT REAR WALL VENT. CRITICAL VIOLATION 7-38-020. MUST REMOVE ALL DROPPINGS, WASH, RINSE AND SANITIZE ALL SURFACES AND EQUIPMENT, HAVE PREMISES SERVICED BY PEST CONTROL COMPANY AND COMPLY WITH ALL FOLLOWING ORDERS. | 2. FACILITIES TO MAINTAIN PROPER TEMPERATURE - Comments: THE AIR TEMPERATURE TAKEN OF THE PREP COOLER IN THE PREP AREA READS 50.9F. 
POTENTIALLY HAZARDOUS FOODS IN COOLER. CRITICAL VIOLATION 7-38-005(A). COOLER MUST BE REPAIRED AND ABLE TO MAINTAIN FOODS AT 40F OR LESS. | 3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATURE REQUIREMENT DURING STORAGE, PREPARATION DISPLAY AND SERVICE - Comments: THE INTERNAL TEMPERATURES TAKEN OF POTENTIALLY HAZARDOUS FOODS INSIDE THE PREP COOLER IN THE FRONT PREP AREA RANGE FROM 46.7F TO 50.9F,FOOD ITEMS ARE SHREDDED CHEESE, SLICED TOMATOES, SOUR CREAM, SHREDDED LETTUCE, SAUSAGE, AND HAM. MANAGER VOLUNTARILY DENATURED AND DESTROYED FOOD ITEMS TOTAL WEIGHT 10LBS, TOTAL COST $50. CRITICAL VIOLATION 7-38-005(A). | 41. PREMISES MAINTAINED FREE OF LITTER, UNNECESSARY ARTICLES, CLEANING EQUIPMENT PROPERLY STORED - Comments: MUST REMOVE EXCESSIVE CLUTTER AND UNNECESSARY ITEMS (CAR BATTERY, TROPHYS, UNUSED EQUIPMENT, BOXES ETC.) THROUGHOUT PREMISES. ESPECIALLY UNDERNEATH COUNTER TOP TABLES. ALSO REMOVE EXCESSIVE CLUTTER FROM THE OUTSIDE REAR PORCH AREA TO PREVENT RODENT HARBORAGE. | 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVERING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: FLOORS NEED DETAIL CLEANING THROUGHOUT PREMISES ALONG WALL BASES, CORNERS AND UNDER SHELVES AND EQUIPMENT. REPLACE MISSING FLOOR TILES IN FRONT OF 3 COMPARTMENT SINK IN REAR KITCHEN AREA AND SEAL HOLES/OPENINGS IN FLOOR IN THE DINNING AREA TO PREVENT RODENT ENTRY OR HARBORAGE. | 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSILS CLEAN, FREE OF ABRASIVE DETERGENTS - Comments: DETAIL CLEAN ALL COOKING EQUIPMENT OF GREASE AND FOOD DEBRIS BUILD-UP AND DEEP FRYER, CLEAN INTERIOR AND EXTERIOR OF ALL COLD HOLDING UNITS OF SPILLS AND OLD FOOD DEBRIS AND DETAIL CLEAN ALL PREP TABLES, SHELVES, COUNTERS, RACKS AND CABINETS THROUGHOUT PREMISES. | 35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, SURFACES CLEAN AND DUST-LESS CLEANING METHODS - Comments: DETAIL CLEAN WALLS BEHIND STOVE AND NEAR THE PREP COOLER IN PREP AREA OF FOOD DEBRIS. 
ALSO SEAL HOLE IN THE REAR AREA WHERE THE OFFICE ENTRANCE IS LOCATED. STAINED AND DIRTY CEILING TILE IN DINING, PREP AND STORAGE AREAS. MUST REPLACE ALL MISSING AND STAINED CEILING TILES. ACCUMULATED DUST ON WALL/CEILING VENT COVERS. MUST CLEAN. ACCUMULATED DUST/DEBRIS/GREASE IN VENT HOODS AND FILTERS ABOVE COOKING EQUIPMENT (FRONT AND REAR). MUST CLEAN AND MAINTAIN HOODS/FILTERS. | 32. FOOD AND NON-FOOD CONTACT SURFACES PROPERLY DESIGNED, CONSTRUCTED AND MAINTAINED - Comments: MUST REMOVE RUST FROM THE GREASE TRAP AT THE 3 COMPARTMENT SINK IN REAR KITCHEN AREA | 30. FOOD IN ORIGINAL CONTAINER, PROPERLY LABELED: CUSTOMER ADVISORY POSTED AS NEEDED - Comments: MUST LABEL AND DATE ALL PREPARED FOODS IN COOLERS HELD OVER 24 HOURS | 38. VENTILATION: ROOMS AND EQUIPMENT VENTED AS REQUIRED: PLUMBING: INSTALLED AND MAINTAINED - Comments: PLUMBING UNDER 3 COMPARTMENT SINK LEAKS. MUST REPAIR AND MAINTAIN PLUMBING.", + "43. FOOD (ICE) DISPENSING UTENSILS, WASH CLOTHS PROPERLY STORED - Comments: OBSERVED ICE SCOOPS STORED IN WITH THE ICE IN THE ICE BINS. 
MANAGEMENT INSTRUCTED TO STORE THE ICE SCOOPS OUTSIDE OF THE ICE BIN IN A CLEAN CONTAINER.",
+  ""
+)
+
+test.parseViolationCodes <- function() {
+  checkEquals(parseViolationCodes(violations), list(c(29, 36, 13, 2, 3, 41, 34, 33, 35, 32, 30, 38), c(43), integer(0)), "violation codes")
+}
+
+test.classifyViolations <- function() {
+  checkEquals(classifyViolations(violations), list(c(2, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3), c(3), logical(0)), "violation severities")
+}
+
+test.violationCounts <- function() {
+  checkEquals(violationCounts(violations), matrix(c(3, 1, 8, 0, 0, 1, 0, 0, 0), ncol=3, byrow=TRUE, dimnames=list(NULL, c("critical", "serious", "minor"))), "violation counts")
+}
diff --git a/R/utilities.R b/R/utilities.R
new file mode 100644
index 0000000..dc835f6
--- /dev/null
+++ b/R/utilities.R
@@ -0,0 +1,157 @@
+# project-independent utility functions
+#
+# Author: Hugh
+###############################################################################
+
+library(lubridate)
+
+#' Weekday abbreviations: the factor levels lubridate assigns to labelled weekdays
+#'
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+WEEKDAY_ABBREVIATIONS <- levels(wday(1:7, label=TRUE))
+
+#' Time-stamped message
+#'
+#' Build "<timestamp> <origin>: <s>" followed by a newline; the timestamp has millisecond precision.
+#' @param s a string, the message text
+#' @param i frame selector handed to \code{sys.call}; the first element of that call is used as the origin
+#' @return a string
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+prefixMsg <- function(s, i) {
+  paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%OS3 "), as.character(sys.call(i))[1], ": ", s, '\n')
+}
+
+#' Time-stamped stop message
+#'
+#' Build a time-stamped, origin-stamped message string; the origin is taken from frame -3L.
+#' @param s a string
+#' @return a string
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+stopMsg <- function(s) {
+  prefixMsg(s, -3L)
+}
+
+#' Time-stamped console message
+#'
+#' Issue a time-stamped, origin-stamped log message.
+#' @param s a string
+#' @return None (invisible NULL) as per cat
+#' @author Hugh J.
Devlin \email{Hugh.Devlin@@cityofchicago.org}
+logMsg <- function(s) {
+  cat(prefixMsg(s, -3L))
+}
+
+#' is whole number
+#'
+#' http://cran.r-project.org/doc/FAQ/R-FAQ.html#Why-doesn_0027t-R-think-these-numbers-are-equal_003f
+#'
+#' @param x a numeric
+#' @param tolerance largest distance from round(x) still counted as whole; defaults to sqrt(.Machine$double.eps)
+#' @return logical, TRUE where x lies within tolerance of round(x)
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+is.wholeNumber <- function(x, tolerance=(.Machine$double.eps ^ 0.5)) {
+  abs(x - round(x)) < tolerance
+}
+
+#' safe.ifelse
+#'
+#' Like R base ifelse, but the result is given the class of yes
+#' After Hadley Wickham from StackOverflow
+#' @param cond,yes,no passed unchanged to \code{ifelse}
+#' @return object of same class as yes
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+safe.ifelse <- function(cond, yes, no) structure(ifelse(cond, yes, no), class = class(yes))
+
+#' avoid integer overflow
+#'
+#' @param x a numeric object or an object which may be coerced to numeric
+#' @return a numeric object
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+sumAsNumeric <- function(x) {
+  sum(as.numeric(x))
+}
+
+#' construct a file path for a graph based on the input file and a label
+#'
+#' @param filePath path of the input file; only its base name is used, and a trailing ".rds" is replaced
+#' @param label text that, joined to "svg" by a dot, replaces the trailing ".rds"
+#' @return a filePath under ../out (the base name is left unchanged when it does not end in ".rds")
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+svgFilePath <- function(filePath, label) {
+  file.path('..', 'out', gsub('\\.rds$', paste(label, 'svg', sep="."), basename(filePath)))
+}
+
+#' save current plot as svg
+#'
+#' @param filePath path of the svg file to write
+#' @return the value of \code{dev.off()}, called after the copy to close the svg device
+#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
+saveAsSvg <- function(filePath) {
+  # Copy the active device straight into a new svg device. Locating the devices with
+  # dev.prev() was only right when exactly one other device was open.
+  dev.copy(svg, filename=filePath)
+  dev.off()
+}
+
+#' overview
+#'
+#' @param x an R object
+#' @return NULL
+#' @author Hugh J.
Devlin \email{Hugh.Devlin@@cityofchicago.org}
+summarize <- function(x) {
+  str(x)
+  print(summary(x))
+  return(NULL)
+}
+
+#' Turn the named columns of a data frame into factors
+#' @param df a data frame
+#' @param columnNames a vector of column names
+#' @return a copy of df in which each named column has been passed through \code{factor}
+#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
+factorizeColumns <- function(df, columnNames) {
+  # df is this function's own copy, so it is updated column by column and returned
+  for(name in columnNames) {
+    df[[name]] <- factor(df[[name]])
+  }
+  df
+}
+
+#' Replace the values of the named columns of a data frame by integer factor codes
+#' @param df a data frame
+#' @param columnNames a vector of column names
+#' @return a copy of df in which each named column holds \code{as.integer(factor(column))}
+#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
+anonymizeColumns <- function(df, columnNames) {
+  # df is this function's own copy, so it is updated column by column and returned
+  for(name in columnNames) {
+    df[[name]] <- as.integer(factor(df[[name]]))
+  }
+  df
+}
+
+#' Parse the named columns of a data frame into POSIXct date-times
+#' @param df a data frame
+#' @param columnNames a vector of column names of data columns
+#' @param format format string passed to \code{as.POSIXct}; defaults to month/day/four-digit year
+#' @return a copy of df in which each named column has been converted with \code{as.POSIXct}
+#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
+posixifyColumns <- function(df, columnNames, format="%m/%d/%Y") {
+  for(name in columnNames) {
+    df[[name]] <- as.POSIXct(df[[name]], format=format)
+  }
+  df
+}
+
+#' convert blanks in named columns in a data frame to NA
+#' @param df a data frame
+#' @param columnNames a vector of column names of data columns
+#' @return the data frame
+#' @author Hugh J. Devlin, Ph. D.
\email{Hugh.Devlin@@cityofchicago.org}
+naColumns <- function(df, columnNames=colnames(df), na="") {
+  for(columnName in columnNames) {
+    # `is.na<-` marks the matches in place, so the column keeps its class; ifelse() returned bare codes for factors
+    is.na(df[[columnName]]) <- which(df[[columnName]] == na)
+  }
+  df
+}
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d16eebd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,17 @@
+This repository contains the necessary data and scripts to evaluate the effectiveness of the City of Chicago's food inspections pilot.
+
+# Files
++ Inspection List.csv - contains the list of restaurants to be visited by the Chicago Department of Public Health (CDPH).
++ foodInspectionsEvaluation.R - is the script which munges data and computes the effectiveness of the pilot.
+
+# To Run
+To run, copy the repository to a local directory. From the food-inspections-evaluation directory, run the R file using the Rscript utility.
+
+```bash
+$ git clone https://github.com/Chicago/food-inspections-evaluation
+$ cd food-inspections-evaluation
+$ Rscript foodInspectionsEvaluation.R
+```
+
+# License
+Copyright, 2014 City of Chicago
diff --git a/runCheckZips.bat b/runCheckZips.bat
new file mode 100644
index 0000000..7428603
--- /dev/null
+++ b/runCheckZips.bat
@@ -0,0 +1,3 @@
+cd R
+pwd
+U:\Programs\R\R-3.0.3\bin\R CMD BATCH --no-restore --no-save ./runCheckZips.R ../out/runCheckZips.log
\ No newline at end of file