Skip to content

Commit

Permalink
Initial paper intro and thoughts
Browse files Browse the repository at this point in the history
  • Loading branch information
john-hawkins committed Jun 7, 2023
1 parent 9fa410e commit 1a46ed0
Show file tree
Hide file tree
Showing 3 changed files with 302 additions and 0 deletions.
9 changes: 9 additions & 0 deletions docs/paper/BUILD.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Build paper.pdf from paper.tex and refs.bib.
# Run this script from the directory that contains paper.tex.
#
# BibTeX reads the citation keys from paper.aux, which only exists after a
# LaTeX pass, so the order below matters:
#   1. pdflatex  - writes paper.aux with the \cite keys and the .bib name
#   2. bibtex    - reads paper.aux and generates paper.bbl
#   3. pdflatex  - pulls the generated bibliography into the document
#   4. pdflatex  - resolves the citation labels and cross-references

pdflatex paper.tex

bibtex paper

pdflatex paper.tex

pdflatex paper.tex


219 changes: 219 additions & 0 deletions docs/paper/paper.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
%%
%% This is file `sample-sigconf.tex',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% samples.dtx (with options: `sigconf')
%%
%% IMPORTANT NOTICE:
%%
%% For the copyright see the source file.
%%
%% Any modified versions of this file must be renamed
%% with new filenames distinct from sample-sigconf.tex.
%%
%% For distribution of the original source see the terms
%% for copying and modification in the file samples.dtx.
%%
%% This generated file may be distributed as long as the
%% original source files, as listed above, are part of the
%% same distribution. (The sources need not necessarily be
%% in the same archive or directory.)
%%
%%
%% Commands for TeXCount
%TC:macro \cite [option:text,text]
%TC:macro \citep [option:text,text]
%TC:macro \citet [option:text,text]
%TC:envir table 0 1
%TC:envir table* 0 1
%TC:envir tabular [ignore] word
%TC:envir displaymath 0 word
%TC:envir math 0 word
%TC:envir comment 0 0
%%
%%
%% The first command in your LaTeX source must be the \documentclass command.
\documentclass[sigconf]{acmart}

%%
%% \BibTeX command to typeset BibTeX logo in the docs
\AtBeginDocument{%
\providecommand\BibTeX{{%
Bib\TeX}}}

%% Rights management information. This information is sent to you
%% when you complete the rights form. These commands have SAMPLE
%% values in them; it is your responsibility as an author to replace
%% the commands and values with those provided to you when you
%% complete the rights form.
\setcopyright{acmcopyright}
\copyrightyear{2023}
\acmYear{2023}
\acmDOI{XXXXXXX.XXXXXXX}

%% These commands are for a PROCEEDINGS abstract or paper.
\acmConference[ICCDA '23]{}{July, 2023}{GuiYang, CN}
\acmPrice{15.00}
\acmISBN{978-1-4503-XXXX-X/23/07}

%%
%% Submission ID.
%% Use this when submitting an article to a sponsored event. You'll
%% receive a unique submission ID from the organizers
%% of the event, and this ID should be used as the parameter to this command.
%%\acmSubmissionID{123-A56-BU3}

%%
%% For managing citations, it is recommended to use bibliography
%% files in BibTeX format.
%%
%% You can then either use BibTeX with the ACM-Reference-Format style,
%% or BibLaTeX with the acmnumeric or acmauthoryear styles, that include
%% support for advanced citation of software artefact from the
%% biblatex-software package, also separately available on CTAN.
%%
%% Look at the sample-*-biblatex.tex files for templates showcasing
%% the biblatex styles.
%%

%%
%% The majority of ACM publications use numbered citations and
%% references. The command \citestyle{authoryear} switches to the
%% "author year" style.
%%
%% If you are preparing content for an event
%% sponsored by ACM SIGGRAPH, you must use the "author year" style of
%% citations and references.
%% Uncommenting
%% the next command will enable that style.
%%\citestyle{acmauthoryear}

%%
%% end of the preamble, start of the body of the document source.
\begin{document}

%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{Decoupled Data Science---Tools for flexible, reproducible eScience}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
%% Of note is the shared affiliation of the first two authors, and the
%% "authornote" and "authornotemark" commands
%% used to denote shared contribution to the research.
\author{John Hawkins}
\email{[email protected]}
\orcid{1234-5678-9012}
\affiliation{%
\institution{Getting-Data-Science-Done.com}
\city{Sydney}
\state{NSW}
\country{Australia}
\postcode{2000}
}

\renewcommand{\shortauthors}{Hawkins}

\begin{abstract}
\end{abstract}

%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
%% Please copy and paste the code instead of the example below.

\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10002951.10003260.10003261</concept_id>
<concept_desc>Information systems~Web searching and information discovery</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10002951.10003260.10003277.10003280</concept_id>
<concept_desc>Information systems~Web log analysis</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010147.10010257</concept_id>
<concept_desc>Computing methodologies~Machine learning</concept_desc>
<concept_significance>500</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}

\ccsdesc[500]{Information systems~Web searching and information discovery}
\ccsdesc[500]{Information systems~Web log analysis}
\ccsdesc[500]{Computing methodologies~Machine learning}

\keywords{Data Science, Experiment Tracking, Reproducible Science, Metadata Tracking}

\maketitle

\section{Introduction}

Progress in scientific research depends heavily on accurate record keeping of previous
experimental approaches and results. Failure to maintain these records impedes progress
by making it difficult to reproduce work, or imposing the costs of repeatedly testing
failed lines of experimentation. This cost becomes particularly high in the age of the
reproducibility crisis, as many teams running the same experiments in parallel, without
knowledge of each other's work, have produced a factory line of un-reproducible results.

Many Machine Learning experimentation frameworks focus on the task of making experiments
easier to execute and deploy into production systems~\cite{Alberti:2018,MolnerDomenech:2020}.
To do so they are often constructed for specific combinations of technology. This is a necessary limitation
to enable efficiency, but has the effect of limiting general applicability.
Many other eScience frameworks focus on the lineage and management of data, referred to as
the provenance problem~\cite{Sahoo:2008,Conquest:2021}.
The goal of the provenance frameworks is sufficient auditability of data sources that will
render eScience transparent and repeatable.

Other frameworks and approaches focus on understanding how to do large scale collaborative science, or
facilitate meta-level learning of various kinds~\cite{Hunter:2005,Liu:2023}. The better we track the
process of science as a whole, the better we can understand both how to improve scientific processes
and how to mine the history of science for phenomena that were difficult to detect.

\section{Methodology}

We begin by discussing all desirable elements required of an open science framework. These are drawn
from observations of both how collaborative science works and the successful components of distributed
scientific endeavours. These requirements are drawn from both sciences that are typically dependent
on computational frameworks (computer science, bioinformatics, physics) and those that generally are not
(social science, psychology).

The key elements are as follows:

\begin{itemize}
\item Provenance of Data Sources
\item Record of Data Processing
\item Reuse of Datasets
\item Tracking of Experiments and Results
\item Comparison of Methods and Results
\item Generation of Documentation
\item Reproducibility of Projects
\item Facilitation of Meta-Analysis
\end{itemize}

\subsection{Data}

\subsection{Features}

\subsection{Feature Importance}


\section{Results}



\section{Conclusion}


\section{Acknowledgments}

\bibliographystyle{ACM-Reference-Format}
\bibliography{refs}

\end{document}
\endinput
74 changes: 74 additions & 0 deletions docs/paper/refs.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
@article{MolnerDomenech:2020,
doi = {10.1088/1742-6596/1603/1/012025},
url = {https://dx.doi.org/10.1088/1742-6596/1603/1/012025},
year = {2020},
month = {sep},
publisher = {IOP Publishing},
volume = {1603},
number = {1},
pages = {012025},
author = {Antonio Molner Domenech and Alberto Guillén},
title = {ml-experiment: A Python framework for reproducible data science},
journal = {Journal of Physics: Conference Series},
}

@INPROCEEDINGS{Conquest:2021,
author={Conquest, Joseph and Stiber, Michael},
booktitle={2021 IEEE 17th International Conference on eScience (eScience)},
title={Software and Data Provenance as a Basis for eScience Workflow},
year={2021},
volume={},
number={},
pages={245--246},
doi={10.1109/eScience51609.2021.00043}
}

@article{Alberti:2018,
author = {Alberti, Michele and Pondenkandath, Vinaychandran and Gygli, Marcel and Ingold, Rolf and Liwicki, Marcus},
year = {2018},
month = {04},
pages = {},
title = {DeepDIVA: A Highly-Functional Python Framework for Reproducible Experiments}
}

@article{Miksa:2017,
author = {Miksa, Tomasz and Rauber, Andreas},
year = {2017},
month = {01},
pages = {},
title = {Using ontologies for verification and validation of workflow-based experiments},
volume = {43},
journal = {Web Semantics: Science, Services and Agents on the World Wide Web},
doi = {10.1016/j.websem.2017.01.002}
}

@article{Sahoo:2008,
author = {Sahoo, Satya and Sheth, Amit and Henson, Cory},
year = {2008},
month = {08},
pages = {46--54},
title = {Semantic Provenance for eScience: Managing the Deluge of Scientific Data},
volume = {12},
journal = {Internet Computing, IEEE},
doi = {10.1109/MIC.2008.86}
}

@article{Hunter:2005,
author = {Hunter, Jane and Cheung, Kwok},
year = {2005},
month = {01},
pages = {},
title = {Generating eScience Workflows from Statistical Analysis of Prior Data}
}

@article{Liu:2023,
author = {Liu, Lu and Jones, Benjamin F. and Uzzi, Brian and Wang, Dashun},
year = {2023},
month = {06},
issn = {2397-3374},
title = {Data, measurement and empirical methods in the science of science},
journal = {Nature Human Behaviour},
doi = {10.1038/s41562-023-01562-4}
}


0 comments on commit 1a46ed0

Please sign in to comment.