-
Notifications
You must be signed in to change notification settings - Fork 130
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Tom Schenk
committed
Sep 12, 2014
0 parents
commit f3035a2
Showing
23 changed files
with
1,505 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
/FoodInspectionEvaluation.zip | ||
/.project | ||
/out | ||
/data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
# TITLE: Sanitarians.R | ||
# AUTHOR: Hugh J. Devlin, Ph. D. | ||
# CREATED: 2014-04-17 | ||
|
||
# Session options applied when this script is sourced: | ||
#   warn=1       print each warning as it occurs instead of deferring it | ||
#   error        on error, enter utils::recover to browse the call frames | ||
#   max.print    limit print() output to 15000 entries | ||
#   width        format console output 200 characters wide | ||
options(warn=1) | ||
options(error=utils::recover) | ||
options(max.print=15000) | ||
options(width=200) | ||
|
||
source("readGarrison.R") | ||
source("readFoodInspectionsHistory.R") | ||
source("overPlotYearsByMonth.R") | ||
|
||
# Private helper: draw one "<measure> by sanitarian and year" strip chart on a
# new graphics device, overlay one line per sanitarian, and copy it to an SVG.
#
# Args:
#   counts:   two-way table (or matrix) with one row per sanitarian and one
#             column per year
#   main:     plot title
#   ylab:     y-axis label
#   filename: path of the SVG file to write
.plotBySanitarianAndYear <- function(counts, main, ylab, filename) {
  dev.new()
  par(las=1) # horizontal axis labels
  stripchart(values ~ ind,
    data=stack(as.data.frame.matrix(counts)),
    pch=20,
    main=main,
    xlab="Year",
    ylab=ylab,
    vertical=TRUE,
    col="blue"
  )
  apply(counts, 1, lines, col="blue") # connect each sanitarian's yearly values
  dev.copy(svg, filename=filename)
  dev.off() # closes the SVG copy
}

# Match Garrison inspections to portal inspections and analyze inspection
# counts, fail counts and failure rates by sanitarian and year.
#
# Args:
#   garrison: data frame of Garrison inspections; the columns read here are
#             License, Date, Zip, Inspector and Address
#   portal:   data frame of portal inspections; the columns read here are
#             License, Inspection.Date, Zip and Results, plus the descriptive
#             columns dropped before the merge
#
# Side effects: logs via logMsg, prints tables and test results, opens
# graphics devices and writes SVG charts under ../out.
# Returns: the value of the final logMsg call.
analyzeSanitarians <- function(garrison, portal) {

  # decorate Garrison data frame
  garrison$year <- year(garrison$Date)
  garrison$month <- month(garrison$Date)

  logMsg(paste('Unique license numbers in Garrison inspections =', length(unique(garrison$License))))
  logMsg(paste('Unique zip codes in Garrison inspections =', length(unique(garrison$Zip))))
  logMsg(paste('Unique sanitarian in Garrison inspections =', length(unique(garrison$Inspector))))
  summarize(garrison)

  logMsg(paste('Unique license numbers in portal inspections =', length(unique(portal$License))))
  logMsg(paste('Unique zip codes in portal inspections =', length(unique(portal$Zip))))
  summarize(portal)

  logMsg('Garrison inspections by sanitarian:')
  bySanitarian <- as.matrix(table(garrison$Inspector))
  print(bySanitarian)
  print(summary(as.integer(bySanitarian)))

  logMsg('Garrison inspections by year and month:')
  byYearMonth <- table(garrison$year, garrison$month)
  print(byYearMonth)
  dev.new()
  overPlotYearsByMonth(
    main="Garrison inspections by year and month",
    ylab="Inspections",
    ylim=range(byYearMonth)
  )
  lapply(2011:2014, plotYearByMonth, byYearMonth)
  addMonthAxisAndYearsLegend(years=2011:2014)
  dev.copy(svg, filename="../out/GarrisonInspectionsByYearAndMonth.svg")
  dev.off()

  # keep one inspection per license and date in each source before merging
  duplicatedGarrison <- duplicated(garrison[ , c("License", "Date")])
  logMsg(paste("Removing duplicate inspections from Garrison:", sum(duplicatedGarrison)))
  garrison <- garrison[!duplicatedGarrison , ]

  duplicatedPortal <- duplicated(portal[ , c("License", "Inspection.Date")])
  logMsg(paste("Removing duplicate inspections from portal:", sum(duplicatedPortal)))
  portal <- portal[!duplicatedPortal , ]

  # inner join on license number and inspection date
  df <- merge(
    subset(garrison, select=-c(Address)),
    subset(portal, select=-c(Zip, DBA.Name, AKA.Name, Inspection.Type, City, State, Latitude, Longitude, Location)),
    by.x=c("License", "Date"), by.y=c("License", "Inspection.Date"))
  duplicatedDf <- duplicated(df[ , c("License", "Date")])
  logMsg(paste("Removing duplicate inspections:", sum(duplicatedDf)))
  df <- df[!duplicatedDf , ]
  logMsg(paste('Unique license numbers in inspections =', length(unique(df$License))))
  logMsg(paste('Unique zip codes in inspections =', length(unique(df$Zip))))
  logMsg(paste('Unique sanitarians in inspections =', length(unique(df$Inspector))))
  summarize(df)

  logMsg('Inspections by sanitarian and year:')
  bySanitarianAndYear <- table(df$Inspector, df$year)
  print(bySanitarianAndYear)
  .plotBySanitarianAndYear(bySanitarianAndYear,
    main="Inspections by sanitarian and year",
    ylab="Inspections",
    filename="../out/InspectionsBySanitarianAndYear.svg")

  logMsg('Fails by sanitarian and year:')
  # which() drops rows whose Results is NA instead of returning all-NA rows
  fails <- df[which(df$Results == 'Fail') , ]
  # Tabulate over the same sanitarians and years as the inspections table so
  # the two tables stay conformable even when a sanitarian or a year has no
  # fails; the division and the data.frame below require matching dimensions.
  failsBySanitarianAndYear <- table(
    factor(fails$Inspector, levels=rownames(bySanitarianAndYear)),
    factor(fails$year, levels=colnames(bySanitarianAndYear)))
  print(failsBySanitarianAndYear)
  .plotBySanitarianAndYear(failsBySanitarianAndYear,
    main="Fails by sanitarian and year",
    ylab="Fails",
    filename="../out/FailsBySanitarianAndYear.svg")

  logMsg('Fail counts correlation by sanitarian from year-to-year:')
  print(cor(failsBySanitarianAndYear, use="pairwise.complete.obs"))
  logMsg('Fail counts by sanitarian correlation 2013 to 2014 with confidence interval and p-value:')
  # cor.test has no 'use' argument; it drops incomplete pairs on its own
  print(cor.test(failsBySanitarianAndYear[ , "2013"], failsBySanitarianAndYear[ , "2014"]))

  logMsg('Failure rates by sanitarian and year:')
  failureRateBySanitarianAndYear <- failsBySanitarianAndYear / bySanitarianAndYear
  print(failureRateBySanitarianAndYear)
  .plotBySanitarianAndYear(failureRateBySanitarianAndYear,
    main="Failure rates by sanitarian and year",
    ylab="Failure rate",
    filename="../out/FailureRatesBySanitarianAndYear.svg")

  logMsg('Failure rate correlation by sanitarian from year-to-year:')
  print(cor(failureRateBySanitarianAndYear, use="pairwise.complete.obs"))
  logMsg('Failure rate by sanitarian correlation 2013 to 2014 with confidence interval and p-value:')
  print(cor.test(failureRateBySanitarianAndYear[ , "2013"], failureRateBySanitarianAndYear[ , "2014"]))

  # pooled 2011-2013 history versus 2014, one row per sanitarian
  sanitarians <- data.frame(
    Inspections2011to2013=rowSums(bySanitarianAndYear[, c("2011", "2012", "2013")]),
    Fails2011to2013=rowSums(failsBySanitarianAndYear[, c("2011", "2012", "2013")]),
    Inspections2014=bySanitarianAndYear[,"2014"],
    Fails2014=failsBySanitarianAndYear[,"2014"]
  )
  sanitarians$Rate2011to2013 <- sanitarians$Fails2011to2013 / sanitarians$Inspections2011to2013
  sanitarians$Rate2014 <- sanitarians$Fails2014 / sanitarians$Inspections2014
  print(sanitarians)
  logMsg('Failure rate by sanitarian correlation 2011-2013 to 2014 with confidence interval and p-value:')
  print(cor.test(sanitarians$Rate2011to2013, sanitarians$Rate2014))

  logMsg('Done.')
}
|
||
# Entry point: read and clean the Garrison inspection history, read the portal
# inspection history, log the size of each, then run the sanitarian analysis.
run <- function() {
  logMsg("Reading Garrison food inspection history")
  garrisonInspections <- cleanGarrison(readGarrison())
  garrisonCount <- nrow(garrisonInspections)
  logMsg(paste("Garrison food inspection history records:", garrisonCount))

  logMsg("Reading portal food inspection history")
  portalInspections <- readFoodInspectionsHistory()
  portalCount <- nrow(portalInspections)
  logMsg(paste("Portal food inspection history records:", portalCount))

  analyzeSanitarians(garrison=garrisonInspections, portal=portalInspections)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
|
||
source("utilities.R") | ||
source("readGarrison.R") | ||
|
||
# Entry point: read the Garrison inspection history, anonymize the Inspector
# column, and write the result to `file` as comma-separated text.
#
# Args:
#   file: path of the CSV file to write
run <- function(file="../out/AnonymizedInspectionsGarrisonExport20112014.csv") {
  logMsg("Reading Garrison food inspection history")
  inspections <- readGarrison()
  recordCount <- nrow(inspections)
  logMsg(paste("Garrison food inspection history records:", recordCount))

  logMsg("Anonymizing Garrison food inspection history records...")
  anonymized <- anonymizeColumns(inspections, "Inspector")

  logMsg("Writing Garrison food inspection history")
  write.table(anonymized, file=file, sep=",", row.names=FALSE)

  logMsg("Done.")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Food inspection utilities | ||
# | ||
# Author: 368982 | ||
############################################################################### | ||
|
||
# Extract the numeric violation codes from each inspection's violations text.
# Each text is split into entries on "| "; each entry is split on ". " and the
# piece before the first ". " is converted to an integer code.
# Returns a list with one integer vector of codes per element of `violations`.
parseViolationCodes <- function(violations) {
  codesOf <- function(entries) {
    pieces <- strsplit(entries, ". ", fixed=TRUE)
    leading <- vapply(pieces, function(parts) parts[1], character(1))
    as.integer(leading)
  }
  lapply(strsplit(violations, "| ", fixed=TRUE), codesOf)
}
|
||
# Map each violation code to a severity class:
# codes 1-14 are critical (1), 15-29 serious (2), 30-44 minor (3),
# anything else (including NA) is other (0).
# Returns a list with one vector of classes per element of `violations`.
classifyViolations <- function(violations) {
  # severity looked up by position: entry i is the class of violation code i
  severityByCode <- rep(c(1, 2, 3), times=c(14, 15, 15))
  classify <- function(codes) {
    slot <- match(codes, seq_along(severityByCode))
    ifelse(is.na(slot), 0, severityByCode[slot])
  }
  lapply(parseViolationCodes(violations), classify)
}
|
||
# Count violations by criticality for each inspection.
#
# Args:
#   violations: character vector with one violations text per inspection
#
# Returns an integer matrix with a row for each inspection and 3 columns named
# "critical", "serious" and "minor". Zero-length input yields a 0 x 3 matrix.
violationCounts <- function(violations) {
  severities <- c("critical", "serious", "minor")
  # tabulate ignores zeroes, so the "other" class (0) is not counted
  perInspection <- lapply(classifyViolations(violations), function(x) tabulate(as.integer(x), nbins=3))
  if (length(perInspection) == 0) {
    # do.call(rbind, list()) is NULL, which cannot carry column names
    return(matrix(integer(0), nrow=0, ncol=length(severities), dimnames=list(NULL, severities)))
  }
  result <- do.call(rbind, perInspection)
  colnames(result) <- severities
  result
}
|
||
# Return a copy of `inspections` extended with per-inspection violation counts
# (criticalCount, seriousCount, minorCount) and logical flags (serious,
# critical, minor) that are TRUE when the matching count is above zero.
# The counts are derived from the Violations column.
countViolations <- function(inspections) {
  tallies <- violationCounts(inspections$Violations)
  annotated <- inspections

  # counts per severity class
  annotated$criticalCount <- tallies[ , "critical"]
  annotated$seriousCount <- tallies[ , "serious"]
  annotated$minorCount <- tallies[ , "minor"]

  # presence flags per severity class
  annotated$serious <- annotated$seriousCount > 0
  annotated$critical <- annotated$criticalCount > 0
  annotated$minor <- annotated$minorCount > 0

  annotated
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# TITLE: foodInspectionZipCodes.R | ||
# AUTHOR: Hugh J. Devlin, Ph. D. | ||
# CREATED: 2014-04-07 | ||
|
||
# Session options applied when this script is sourced: | ||
#   warn=1       print each warning as it occurs instead of deferring it | ||
#   error        on error, enter utils::recover to browse the call frames | ||
#   max.print    limit print() output to 15000 entries | ||
#   width        format console output 200 characters wide | ||
options(warn=1) | ||
options(error=utils::recover) | ||
options(max.print=15000) | ||
options(width=200) | ||
|
||
source("utilities.R") | ||
|
||
# Convert the named columns of a data frame from "month/day/year" text to
# POSIXct date-times.
#
# Args:
#   df:          data frame holding the columns to convert
#   columnNames: character vector naming the columns to convert
#
# Returns a copy of `df` with those columns replaced; `df` itself is untouched.
posixifyColumns <- function(df, columnNames) {
  converted <- df
  converted[columnNames] <- lapply(converted[columnNames], as.POSIXct, format="%m/%d/%Y")
  converted
}
|
||
# Read the food inspections CSV, treating empty fields as NA and keeping text
# columns as character.
# The "License.." column ("License #" on the portal) is renamed to "License"
# so it can serve as a merge column, and Inspection.Date is parsed from
# "month/day/year" text into POSIXct.
#
# Args:
#   file: path of the food inspections CSV
readInspectionHistory <- function(file="../data/Food_Inspections.csv") {
  inspections <- read.csv(file, stringsAsFactors=FALSE, na.strings='')

  columnNames <- names(inspections)
  columnNames[columnNames == "License.."] <- "License"
  names(inspections) <- columnNames

  inspectionDates <- as.POSIXct(inspections$Inspection.Date, format="%m/%d/%Y")
  inspections$Inspection.Date <- inspectionDates
  inspections
}
|
||
# Read the business licenses CSV, treating empty fields as NA and keeping text
# columns as character, then convert its date columns from "month/day/year"
# text to POSIXct via posixifyColumns.
#
# Args:
#   file: path of the business licenses CSV
readLicenses <- function(file="../data/Business_Licenses.csv") {
  dateColumns <- c(
    "APPLICATION.CREATED.DATE",
    "APPLICATION.REQUIREMENTS.COMPLETE",
    "PAYMENT.DATE",
    "LICENSE.TERM.START.DATE",
    "LICENSE.TERM.EXPIRATION.DATE",
    "LICENSE.APPROVED.FOR.ISSUANCE",
    "DATE.ISSUED",
    "LICENSE.STATUS.CHANGE.DATE"
  )
  licenses <- read.csv(file, stringsAsFactors=FALSE, na.strings='')
  posixifyColumns(licenses, dateColumns)
}
|
||
# Compare food inspections with currently active business licenses.
#
# Writes two CSV reports: inspected license numbers that have no active
# business license, and inspections whose zip code differs from the zip code
# on the matching business license.
#
# Args:
#   inspections: data frame of food inspections; the columns read here are
#                License, Inspection.Date (POSIXct), Zip, DBA.Name, Address
#                and Results
#   licenses:    data frame of business licenses; the columns read here are
#                ACCOUNT.NUMBER, LICENSE.NUMBER, LICENSE.CODE, LICENSE.STATUS,
#                LICENSE.TERM.START.DATE, LICENSE.TERM.EXPIRATION.DATE
#                (both POSIXct) and ZIP.CODE
#   startDate:   "YYYY-MM-DD" text; earlier inspections are ignored
#   licensesNotFoundPath:   output path for the licenses-not-found report
#   zipCodeDiscrepencyPath: output path for the zip code discrepancy report
#
# Side effects: logs via logMsg, prints tables, writes the two CSV files.
# Returns: the value of the final logMsg call.
compareZips <- function(inspections, licenses, startDate="2014-01-01",
    licensesNotFoundPath='../out/licensesNotFound.csv',
    zipCodeDiscrepencyPath='../out/foodInspectionZipCodeDiscrepencies.csv') {

  # subset food inspections by start date; which() drops rows with NA dates
  cutoff <- as.POSIXct(startDate, format="%Y-%m-%d")
  inspections <- inspections[which(inspections$Inspection.Date >= cutoff) , ]
  inspections <- inspections[!is.na(inspections$License) , ]
  logMsg(paste('Food inspections after', startDate, ':', nrow(inspections)))
  summarize(inspections)

  inspectionLicenseNumbers <- unique(inspections$License)
  logMsg(paste('Unique license numbers in food inspections =', length(inspectionLicenseNumbers)))
  logMsg(paste('Unique zip codes in food inspections =', length(unique(inspections$Zip))))
  logMsg('Food inspections by zip code:')
  print(sort(table(inspections$Zip), decreasing=TRUE))

  # subset to currently active business licenses
  keyColumns <- c("ACCOUNT.NUMBER", "LICENSE.NUMBER", "LICENSE.CODE", "LICENSE.TERM.START.DATE", "LICENSE.TERM.EXPIRATION.DATE")
  licenses <- licenses[complete.cases(licenses[ , keyColumns]) , ]
  # %in% is FALSE for an NA status, where == would yield NA and add all-NA rows
  licenses <- licenses[licenses$LICENSE.STATUS %in% "AAI" , ]
  now <- Sys.time() # one timestamp for both ends of the term check
  licenses <- licenses[licenses$LICENSE.TERM.START.DATE <= now & now <= licenses$LICENSE.TERM.EXPIRATION.DATE , ]
  # Keep only the most recent of duplicates
  licenses <- licenses[with(licenses, order(ACCOUNT.NUMBER, LICENSE.NUMBER, LICENSE.CODE, LICENSE.TERM.START.DATE)) , ] # sort
  licenses <- licenses[!duplicated(licenses$LICENSE.NUMBER, fromLast=TRUE) , ]
  summarize(licenses)

  licenseNumbers <- unique(licenses$LICENSE.NUMBER)
  logMsg(paste('Unique license numbers in business licenses =', length(licenseNumbers)))
  logMsg(paste('Unique zip codes in business licenses =', length(unique(licenses$ZIP.CODE))))
  logMsg('Licenses by zip code:')
  byZipCode <- table(licenses$ZIP.CODE)
  print(sort(byZipCode, decreasing=TRUE))

  # sanity check: duplicates were removed above, so this count is 0
  duplicates <- duplicated(licenses$LICENSE.NUMBER, fromLast=FALSE) | duplicated(licenses$LICENSE.NUMBER, fromLast=TRUE)
  logMsg(paste('Duplicate active license numbers in business licenses:', sum(duplicates)))

  logMsg(paste('Unique license numbers in business licenses =', length(unique(licenses$LICENSE.NUMBER))))

  # inspections not found in business licenses
  inspectionsNotFound <- inspections[!(inspections$License %in% licenseNumbers) , c("License" , "DBA.Name", "Address", "Inspection.Date", "Results")]
  inspectionsNotFound <- inspectionsNotFound[with(inspectionsNotFound, order(License, Inspection.Date)) , ] # sort
  inspectionsNotFound <- inspectionsNotFound[!duplicated(inspectionsNotFound$License, fromLast=TRUE) , ] # keep only most recent
  # Use the full column name (no reliance on $ partial matching) and %in%, so
  # a row with an NA result is kept as it is rather than turned into an
  # all-NA row.
  closedResults <- c("Business Not Located", "Out of Business")
  inspectionsNotFound <- inspectionsNotFound[!(inspectionsNotFound$Results %in% closedResults) , ]
  logMsg(paste('License numbers in inspections not found in business licenses =', nrow(inspectionsNotFound)))
  write.table(inspectionsNotFound, file=licensesNotFoundPath, sep=',', row.names=FALSE)

  # zip code discrepancies between the inspections and the business licenses
  inspections <- inspections[!is.na(inspections$Zip) , ]
  logMsg(paste('Inspections with zip codes =', nrow(inspections)))
  licenses <- licenses[!is.na(licenses$ZIP.CODE) , ]
  logMsg(paste('Business licenses with zip codes =', nrow(licenses)))
  inspections <- merge(inspections, licenses, by.x="License", by.y="LICENSE.NUMBER")
  logMsg(paste('Inspections matching business license on license number =', nrow(inspections)))
  # both zip columns are NA-free at this point, so != cannot produce NA
  inspections <- inspections[(inspections$Zip != inspections$ZIP.CODE) , ]
  logMsg(paste('Inspections with zip codes different from business license =', nrow(inspections)))
  inspections <- inspections[ , c("License", "Inspection.Date", "Results", "DBA.Name", "Address", "Zip", "ZIP.CODE")]
  names(inspections)[names(inspections)=="Zip"] <- "Food Inspection Zip"
  names(inspections)[names(inspections)=="ZIP.CODE"] <- "Business License Zip"
  inspections <- inspections[with(inspections, order(License, Inspection.Date)) , ] # sort
  write.table(inspections, file=zipCodeDiscrepencyPath, sep=',', row.names=FALSE)

  logMsg('Done.')
}
|
||
# Entry point: read the food inspections and the business licenses, log the
# size of each, then compare their zip codes.
run <- function() {
  logMsg("Reading food inspections")
  inspections <- readInspectionHistory()
  inspectionCount <- nrow(inspections)
  logMsg(paste("Food inspections:", inspectionCount))

  logMsg("Reading licenses")
  licenses <- readLicenses()
  licenseCount <- nrow(licenses)
  logMsg(paste("Licenses:", licenseCount))

  compareZips(inspections=inspections, licenses=licenses)
}
Oops, something went wrong.