Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom Schenk committed Sep 12, 2014
0 parents commit f3035a2
Show file tree
Hide file tree
Showing 23 changed files with 1,505 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/FoodInspectionEvaluation.zip
/.project
/out
/data
159 changes: 159 additions & 0 deletions R/Sanitarians.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# TITLE: Sanitarians.R
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-17

# Session-wide settings for interactive analysis runs.
# Print warnings as they occur rather than deferring them.
options(warn=1)
# On error, hand control to utils::recover so the call stack can be inspected.
options(error=utils::recover)
# Allow long tables to be printed in full.
options(max.print=15000)
# Wide console output so printed tables wrap less.
options(width=200)

source("readGarrison.R")
source("readFoodInspectionsHistory.R")
source("overPlotYearsByMonth.R")

# Match Garrison inspections to portal inspections and analyze sanitarians.
#
# garrison: data frame of Garrison inspections; this function reads its Date,
#           License, Zip, Inspector and Address columns
# portal:   data frame of portal inspections; this function reads its License,
#           Zip and Inspection.Date columns and drops several descriptive
#           columns before merging
#
# The merged data must carry a Results column (compared against 'Fail').
# Logs summaries, prints tables and correlations, and copies four plots to
# SVG files under ../out. Years 2011 through 2014 are hard-coded in the plots,
# the year-to-year tests and the final summary table.
analyzeSanitarians <- function(garrison, portal) {

  # decorate Garrison data frame
  garrison$year <- year(garrison$Date)
  garrison$month <- month(garrison$Date)

  logMsg(paste('Unique license numbers in Garrison inspections =', length(unique(garrison$License))))
  logMsg(paste('Unique zip codes in Garrison inspections =', length(unique(garrison$Zip))))
  logMsg(paste('Unique sanitarian in Garrison inspections =', length(unique(garrison$Inspector))))
  summarize(garrison)

  logMsg(paste('Unique license numbers in portal inspections =', length(unique(portal$License))))
  logMsg(paste('Unique zip codes in portal inspections =', length(unique(portal$Zip))))
  summarize(portal)

  logMsg('Garrison inspections by sanitarian:')
  bySanitarian <- as.matrix(table(garrison$Inspector))
  print(bySanitarian)
  print(summary(as.integer(bySanitarian)))

  logMsg('Garrison inspections by year and month:')
  byYearMonth <- table(garrison$year, garrison$month)
  print(byYearMonth)
  dev.new()
  overPlotYearsByMonth(
    main="Garrison inspections by year and month",
    ylab="Inspections",
    ylim=range(byYearMonth)
  )
  lapply(2011:2014, plotYearByMonth, byYearMonth)
  addMonthAxisAndYearsLegend(years=2011:2014)
  dev.copy(svg, filename="../out/GarrisonInspectionsByYearAndMonth.svg")
  dev.off()

  # an inspection is identified by license number and date in both sources
  duplicatedGarrison <- duplicated(garrison[ , c("License", "Date")])
  logMsg(paste("Removing duplicate inspections from Garrison:", sum(duplicatedGarrison)))
  garrison <- garrison[!duplicatedGarrison , ]

  duplicatedPortal <- duplicated(portal[ , c("License", "Inspection.Date")])
  logMsg(paste("Removing duplicate inspections from portal:", sum(duplicatedPortal)))
  portal <- portal[!duplicatedPortal , ]

  df <- merge(
    subset(garrison, select=-c(Address)),
    subset(portal, select=-c(Zip, DBA.Name, AKA.Name, Inspection.Type, City, State, Latitude, Longitude, Location)),
    by.x=c("License", "Date"), by.y=c("License", "Inspection.Date"))
  duplicatedDf <- duplicated(df[ , c("License", "Date")])
  logMsg(paste("Removing duplicate inspections:", sum(duplicatedDf)))
  df <- df[!duplicatedDf , ]
  logMsg(paste('Unique license numbers in inspections =', length(unique(df$License))))
  logMsg(paste('Unique zip codes in inspections =', length(unique(df$Zip))))
  logMsg(paste('Unique sanitarians in inspections =', length(unique(df$Inspector))))
  summarize(df)

  logMsg('Inspections by sanitarian and year:')
  bySanitarianAndYear <- table(df$Inspector, df$year)
  print(bySanitarianAndYear)
  dev.new()
  par(las=1) # horizontal axis labels
  stripchart(values ~ ind,
    data=stack(as.data.frame.matrix(bySanitarianAndYear)),
    pch=20,
    main="Inspections by sanitarian and year",
    xlab="Year",
    ylab="Inspections",
    vertical=TRUE,
    col="blue"
  )
  apply(bySanitarianAndYear, 1, lines, col="blue")
  dev.copy(svg, filename="../out/InspectionsBySanitarianAndYear.svg")
  dev.off()

  logMsg('Fails by sanitarian and year:')
  # which() drops rows whose Results is NA instead of turning them into all-NA rows
  fails <- df[which(df$Results == 'Fail') , ]
  # Tabulate over the same sanitarians and years as the inspections table so that
  # sanitarians or years without any fail still get a (zero) cell; otherwise the
  # two tables can differ in shape and the rate division below is not element-wise.
  failsBySanitarianAndYear <- table(
    factor(fails$Inspector, levels=rownames(bySanitarianAndYear)),
    factor(fails$year, levels=colnames(bySanitarianAndYear)))
  print(failsBySanitarianAndYear)
  dev.new()
  par(las=1) # horizontal axis labels
  stripchart(values ~ ind,
    data=stack(as.data.frame.matrix(failsBySanitarianAndYear)),
    pch=20,
    main="Fails by sanitarian and year",
    xlab="Year",
    ylab="Fails",
    vertical=TRUE,
    col="blue"
  )
  apply(failsBySanitarianAndYear, 1, lines, col="blue")
  dev.copy(svg, filename="../out/FailsBySanitarianAndYear.svg")
  dev.off()

  logMsg('Fail counts correlation by sanitarian from year-to-year:')
  print(cor(failsBySanitarianAndYear, use="pairwise.complete.obs"))
  logMsg('Fail counts by sanitarian correlation 2013 to 2014 with confidence interval and p-value:')
  # cor.test() has no 'use' argument; it drops incomplete pairs on its own
  print(cor.test(failsBySanitarianAndYear[ , "2013"], failsBySanitarianAndYear[ , "2014"]))

  logMsg('Failure rates by sanitarian and year:')
  # cells with no inspections give 0/0 = NaN, which the correlations treat as missing
  failureRateBySanitarianAndYear <- failsBySanitarianAndYear / bySanitarianAndYear
  print(failureRateBySanitarianAndYear)
  dev.new()
  par(las=1) # horizontal axis labels
  stripchart(values ~ ind,
    data=stack(as.data.frame.matrix(failureRateBySanitarianAndYear)),
    pch=20,
    main="Failure rates by sanitarian and year",
    xlab="Year",
    ylab="Failure rate",
    vertical=TRUE,
    col="blue"
  )
  apply(failureRateBySanitarianAndYear, 1, lines, col="blue")
  dev.copy(svg, filename="../out/FailureRatesBySanitarianAndYear.svg")
  dev.off()

  logMsg('Failure rate correlation by sanitarian from year-to-year:')
  print(cor(failureRateBySanitarianAndYear, use="pairwise.complete.obs"))
  logMsg('Failure rate by sanitarian correlation 2013 to 2014 with confidence interval and p-value:')
  print(cor.test(failureRateBySanitarianAndYear[ , "2013"], failureRateBySanitarianAndYear[ , "2014"]))

  # pooled 2011-2013 history versus 2014, one row per sanitarian
  sanitarians <- data.frame(
    Inspections2011to2013=rowSums(bySanitarianAndYear[, c("2011", "2012", "2013")]),
    Fails2011to2013=rowSums(failsBySanitarianAndYear[, c("2011", "2012", "2013")]),
    Inspections2014=bySanitarianAndYear[,"2014"],
    Fails2014=failsBySanitarianAndYear[,"2014"]
  )
  sanitarians$Rate2011to2013 <- sanitarians$Fails2011to2013 / sanitarians$Inspections2011to2013
  sanitarians$Rate2014 <- sanitarians$Fails2014 / sanitarians$Inspections2014
  print(sanitarians)
  logMsg('Failure rate by sanitarian correlation 2011-2013 to 2014 with confidence interval and p-value:')
  print(cor.test(sanitarians$Rate2011to2013, sanitarians$Rate2014))

  logMsg('Done.')
}

# Entry point: load both inspection histories, log their sizes, then run the
# sanitarian analysis on them.
run <- function() {
  logMsg('Reading Garrison food inspection history')
  rawGarrison <- readGarrison()
  garrisonInspections <- cleanGarrison(rawGarrison)
  garrisonCount <- nrow(garrisonInspections)
  logMsg(paste('Garrison food inspection history records:', garrisonCount))

  logMsg('Reading portal food inspection history')
  portalInspections <- readFoodInspectionsHistory()
  portalCount <- nrow(portalInspections)
  logMsg(paste('Portal food inspection history records:', portalCount))

  analyzeSanitarians(garrisonInspections, portalInspections)
}
14 changes: 14 additions & 0 deletions R/anonymizeInspections.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

source("utilities.R")
source("readGarrison.R")

# Entry point: read the Garrison inspection history, anonymize the Inspector
# column and write the result as a comma-separated file.
#
# file: path of the CSV file to write
run <- function(file="../out/AnonymizedInspectionsGarrisonExport20112014.csv") {
  logMsg('Reading Garrison food inspection history')
  inspections <- readGarrison()
  recordCount <- nrow(inspections)
  logMsg(paste('Garrison food inspection history records:', recordCount))

  logMsg('Anonymizing Garrison food inspection history records...')
  columnsToAnonymize <- c("Inspector")
  anonymized <- anonymizeColumns(inspections, columnsToAnonymize)

  logMsg('Writing Garrison food inspection history')
  write.table(anonymized, file=file, sep=',', row.names=FALSE)

  logMsg('Done.')
}
40 changes: 40 additions & 0 deletions R/foodInspectionUtilities.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Food inspection utilities
#
# Author: 368982
###############################################################################

# Extract the violation code numbers from violation description strings.
# Each string is split into individual violations on "| "; each violation is
# split on ". " and the leading piece (the code number) is kept as an integer.
# Returns a list with one integer vector per input string.
parseViolationCodes <- function(violations) {
  perInspection <- strsplit(violations, "| ", fixed=TRUE)
  leadingCodes <- function(entries) {
    pieces <- strsplit(entries, ". ", fixed=TRUE)
    as.integer(lapply(pieces, FUN='[', 1))
  }
  lapply(perInspection, leadingCodes)
}

# Map each violation code to a criticality class:
# codes 1-14 are critical (1), 15-29 serious (2), 30-44 minor (3), anything else 0.
# Returns a list with one vector of classes per input string.
classifyViolations <- function(violations) {
  criticalityOf <- function(codes) {
    ifelse(codes %in% 1:14, 1,
      ifelse(codes %in% 15:29, 2,
        ifelse(codes %in% 30:44, 3, 0)))
  }
  parsedCodes <- parseViolationCodes(violations)
  lapply(parsedCodes, criticalityOf)
}

# Count violations by criticality for each inspection.
#
# violations: character vector of violation description strings, one per inspection
#
# Returns an integer matrix with a row for each inspection and 3 columns named
# "critical", "serious" and "minor". tabulate ignores zeroes, so violations
# classified as 0 (other) are not counted. Empty input yields a 0-row matrix.
violationCounts <- function(violations) {
  columnLabels <- c("critical", "serious", "minor")
  perInspection <- lapply(classifyViolations(violations), function(x) tabulate(as.integer(x), nbins=3))
  if (length(perInspection) == 0) {
    # rbind over an empty list is NULL, which cannot take column names
    return(matrix(0L, nrow=0, ncol=3, dimnames=list(NULL, columnLabels)))
  }
  result <- do.call(rbind, perInspection)
  colnames(result) <- columnLabels
  result
}

# Return a copy of the inspections data frame extended with violation counts
# (criticalCount, seriousCount, minorCount) taken from its Violations column,
# plus logical flags (serious, critical, minor) telling whether each count is positive.
countViolations <- function(inspections) {
  counts <- violationCounts(inspections$Violations)
  augmented <- inspections

  # per-inspection counts, one column per criticality
  augmented[["criticalCount"]] <- counts[ , "critical"]
  augmented[["seriousCount"]] <- counts[ , "serious"]
  augmented[["minorCount"]] <- counts[ , "minor"]

  # presence flags derived from the counts
  augmented[["serious"]] <- augmented[["seriousCount"]] > 0
  augmented[["critical"]] <- augmented[["criticalCount"]] > 0
  augmented[["minor"]] <- augmented[["minorCount"]] > 0

  augmented
}

115 changes: 115 additions & 0 deletions R/foodInspectionZipCodes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# TITLE: foodInspectionZipCodes.R
# AUTHOR: Hugh J. Devlin, Ph. D.
# CREATED: 2014-04-07

# Session-wide settings for interactive analysis runs.
# Print warnings as they occur rather than deferring them.
options(warn=1)
# On error, hand control to utils::recover so the call stack can be inspected.
options(error=utils::recover)
# Allow long tables to be printed in full.
options(max.print=15000)
# Wide console output so printed tables wrap less.
options(width=200)

source("utilities.R")

# Convert the named columns of a data frame from "%m/%d/%Y" strings to POSIXct.
#
# df:          data frame to convert
# columnNames: names of the columns to convert
#
# Returns a copy of df with the named columns converted; other columns are untouched.
posixifyColumns <- function(df, columnNames) {
  toPosix <- function(values) as.POSIXct(values, format="%m/%d/%Y")
  converted <- df
  for (i in seq_along(columnNames)) {
    name <- columnNames[[i]]
    converted[[name]] <- toPosix(converted[[name]])
  }
  converted
}

# Read the food inspections CSV file.
#
# file: path of the CSV file to read
#
# Empty fields are read as NA and strings are kept as character. The column
# read.csv names "License.." ("License #" on the portal) is renamed to
# "License" so it can serve as a merge column, and Inspection.Date is parsed
# from "%m/%d/%Y" into POSIXct.
readInspectionHistory <- function(file="../data/Food_Inspections.csv") {
  inspections <- read.csv(file, stringsAsFactors=FALSE, na.strings='')

  columnLabels <- names(inspections)
  columnLabels[columnLabels == "License.."] <- "License"
  names(inspections) <- columnLabels

  parsedDates <- as.POSIXct(inspections$Inspection.Date, format="%m/%d/%Y")
  inspections$Inspection.Date <- parsedDates
  inspections
}

# Read the business licenses CSV file.
#
# file: path of the CSV file to read
#
# Empty fields are read as NA and strings are kept as character. The date
# columns listed below are converted with posixifyColumns.
readLicenses <- function(file="../data/Business_Licenses.csv") {
  dateColumns <- c(
    "APPLICATION.CREATED.DATE",
    "APPLICATION.REQUIREMENTS.COMPLETE",
    "PAYMENT.DATE",
    "LICENSE.TERM.START.DATE",
    "LICENSE.TERM.EXPIRATION.DATE",
    "LICENSE.APPROVED.FOR.ISSUANCE",
    "DATE.ISSUED",
    "LICENSE.STATUS.CHANGE.DATE"
  )
  licenses <- read.csv(file, stringsAsFactors=FALSE, na.strings='')
  posixifyColumns(licenses, dateColumns)
}

# Compare food inspections against currently active business licenses.
#
# inspections:            data frame of food inspections; this function reads its
#                         License, Inspection.Date, Zip, DBA.Name, Address and
#                         Results columns
# licenses:               data frame of business licenses; this function reads its
#                         ACCOUNT.NUMBER, LICENSE.NUMBER, LICENSE.CODE,
#                         LICENSE.STATUS, LICENSE.TERM.START.DATE,
#                         LICENSE.TERM.EXPIRATION.DATE and ZIP.CODE columns
# startDate:              "%Y-%m-%d" date; earlier inspections are ignored
# licensesNotFoundPath:   CSV written with inspected licenses that have no
#                         active business license
# zipCodeDiscrepencyPath: CSV written with inspections whose zip code differs
#                         from the zip code on the matching business license
#
# Logs and prints summaries along the way and writes the two CSV files.
compareZips <- function(inspections, licenses, startDate="2014-01-01",
    licensesNotFoundPath='../out/licensesNotFound.csv',
    zipCodeDiscrepencyPath='../out/foodInspectionZipCodeDiscrepencies.csv') {

  # subset food inspections by start date (subset() also drops rows with NA dates)
  inspections <- subset(inspections, Inspection.Date >= as.POSIXct(startDate, format="%Y-%m-%d"))
  inspections <- inspections[complete.cases(inspections[ , c("License")]) , ]
  logMsg(paste('Food inspections after', startDate, ':', nrow(inspections)))
  summarize(inspections)

  inspectionLicenseNumbers <- unique(inspections$License)
  logMsg(paste('Unique license numbers in food inspections =', length(inspectionLicenseNumbers)))
  logMsg(paste('Unique zip codes in food inspections =', length(unique(inspections$Zip))))
  logMsg('Food inspections by zip code:')
  print(sort(table(inspections$Zip), decreasing=TRUE))

  # subset to currently active business licenses
  licenses <- licenses[complete.cases(licenses[ , c("ACCOUNT.NUMBER", "LICENSE.NUMBER", "LICENSE.CODE", "LICENSE.TERM.START.DATE", "LICENSE.TERM.EXPIRATION.DATE")]) , ]
  now <- Sys.time() # one timestamp for both ends of the term check
  isActive <- licenses$LICENSE.STATUS == "AAI" &
    licenses$LICENSE.TERM.START.DATE <= now &
    now <= licenses$LICENSE.TERM.EXPIRATION.DATE
  # which() drops rows whose status is NA instead of turning them into all-NA rows
  licenses <- licenses[which(isActive) , ]
  # Keep only the most recent of duplicates
  licenses <- licenses[with(licenses, order(ACCOUNT.NUMBER, LICENSE.NUMBER, LICENSE.CODE, LICENSE.TERM.START.DATE)) , ] # sort
  licenses <- licenses[!duplicated(licenses$LICENSE.NUMBER, fromLast=TRUE) , ]
  summarize(licenses)

  licenseNumbers <- unique(licenses$LICENSE.NUMBER)
  logMsg(paste('Unique license numbers in business licenses =', length(licenseNumbers)))
  logMsg(paste('Unique zip codes in business licenses =', length(unique(licenses$ZIP.CODE))))
  logMsg('Licenses by zip code:')
  byZipCode <- table(licenses$ZIP.CODE)
  print(sort(byZipCode, decreasing=TRUE))

  # sanity check: after the de-duplication above this count is expected to be zero
  duplicates <- duplicated(licenses$LICENSE.NUMBER, fromLast=FALSE) | duplicated(licenses$LICENSE.NUMBER, fromLast=TRUE)
  logMsg(paste('Duplicate active license numbers in business licenses:', sum(duplicates)))

  logMsg(paste('Unique license numbers in business licenses =', length(unique(licenses$LICENSE.NUMBER))))

  # inspections not found in business licenses
  inspectionsNotFound <- inspections[!(inspections$License %in% licenseNumbers) , c("License" , "DBA.Name", "Address", "Inspection.Date", "Results")]
  inspectionsNotFound <- inspectionsNotFound[with(inspectionsNotFound, order(License, Inspection.Date)) , ] # sort
  inspectionsNotFound <- inspectionsNotFound[!duplicated(inspectionsNotFound$License, fromLast=TRUE) , ] # keep only most recent
  # A missing license is expected for these results. %in% on the exact Results
  # column avoids partial matching and keeps rows whose result is NA intact.
  expectedMissing <- inspectionsNotFound$Results %in% c("Business Not Located", "Out of Business")
  inspectionsNotFound <- inspectionsNotFound[!expectedMissing , ]
  logMsg(paste('License numbers in inspections not found in business licenses =', nrow(inspectionsNotFound)))
  write.table(inspectionsNotFound, file=licensesNotFoundPath, sep=',', row.names=FALSE)

  # zip code discrepancies between the inspections and the business licenses
  inspections <- inspections[complete.cases(inspections[ , c("Zip")]) , ]
  logMsg(paste('Inspections with zip codes =', nrow(inspections)))
  licenses <- licenses[complete.cases(licenses[ , c("ZIP.CODE")]) , ]
  logMsg(paste('Business licenses with zip codes =', nrow(licenses)))
  inspections <- merge(inspections, licenses, by.x="License", by.y="LICENSE.NUMBER")
  logMsg(paste('Inspections matching business license on license number =', nrow(inspections)))
  # both zip columns are free of NA at this point, so the comparison is never NA
  inspections <- inspections[(inspections$Zip != inspections$ZIP.CODE) , ]
  logMsg(paste('Inspections with zip codes different from business license =', nrow(inspections)))
  inspections <- inspections[ , c("License", "Inspection.Date", "Results", "DBA.Name", "Address", "Zip", "ZIP.CODE")]
  names(inspections)[names(inspections)=="Zip"] <- "Food Inspection Zip"
  names(inspections)[names(inspections)=="ZIP.CODE"] <- "Business License Zip"
  inspections <- inspections[with(inspections, order(License, Inspection.Date)) , ] # sort
  write.table(inspections, file=zipCodeDiscrepencyPath, sep=',', row.names=FALSE)

  logMsg('Done.')
}

# Entry point: read food inspections and business licenses, log their sizes,
# then compare them.
run <- function() {
  logMsg('Reading food inspections')
  foodInspections <- readInspectionHistory()
  inspectionCount <- nrow(foodInspections)
  logMsg(paste('Food inspections:', inspectionCount))

  logMsg('Reading licenses')
  businessLicenses <- readLicenses()
  licenseCount <- nrow(businessLicenses)
  logMsg(paste('Licenses:', licenseCount))

  compareZips(foodInspections, businessLicenses)
}
Loading

0 comments on commit f3035a2

Please sign in to comment.