diff --git a/docs/paper/BUILD.sh b/docs/paper/BUILD.sh
new file mode 100755
index 0000000..4976c55
--- /dev/null
+++ b/docs/paper/BUILD.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+#
+# Build paper.pdf from paper.tex and refs.bib.
+#
+# BibTeX reads its citation list from paper.aux, which only exists after a
+# LaTeX pass, so LaTeX has to run first. The passes after BibTeX pull in the
+# generated paper.bbl and then resolve the citation labels.
+
+pdflatex paper.tex   # pass 1: writes paper.aux
+bibtex paper         # writes paper.bbl from paper.aux and refs.bib
+pdflatex paper.tex   # pass 2: includes the bibliography
+pdflatex paper.tex   # pass 3: resolves citation labels
diff --git a/docs/paper/paper.tex b/docs/paper/paper.tex
new file mode 100644
index 0000000..7cbb83f
--- /dev/null
+++ b/docs/paper/paper.tex
@@ -0,0 +1,219 @@
+%%
+%% This is file `sample-sigconf.tex',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% samples.dtx (with options: `sigconf')
+%%
+%% IMPORTANT NOTICE:
+%%
+%% For the copyright see the source file.
+%%
+%% Any modified versions of this file must be renamed
+%% with new filenames distinct from sample-sigconf.tex.
+%%
+%% For distribution of the original source see the terms
+%% for copying and modification in the file samples.dtx.
+%%
+%% This generated file may be distributed as long as the
+%% original source files, as listed above, are part of the
+%% same distribution. (The sources need not necessarily be
+%% in the same archive or directory.)
+%%
+%%
+%% Commands for TeXCount
+%TC:macro \cite [option:text,text]
+%TC:macro \citep [option:text,text]
+%TC:macro \citet [option:text,text]
+%TC:envir table 0 1
+%TC:envir table* 0 1
+%TC:envir tabular [ignore] word
+%TC:envir displaymath 0 word
+%TC:envir math 0 word
+%TC:envir comment 0 0
+%%
+%%
+%% The first command in your LaTeX source must be the \documentclass command.
+\documentclass[sigconf]{acmart}
+
+%%
+%% \BibTeX command to typeset BibTeX logo in the docs
+\AtBeginDocument{%
+  \providecommand\BibTeX{{%
+    Bib\TeX}}}
+
+%% Rights management information. This information is sent to you
+%% when you complete the rights form. These commands have SAMPLE
+%% values in them; it is your responsibility as an author to replace
+%% the commands and values with those provided to you when you
+%% complete the rights form.
+\setcopyright{acmcopyright} +\copyrightyear{2023} +\acmYear{2023} +\acmDOI{XXXXXXX.XXXXXXX} + +%% These commands are for a PROCEEDINGS abstract or paper. +\acmConference[ICCDA '23]{}{July, 2023}{GuiYang, CN} +\acmPrice{15.00} +\acmISBN{978-1-4503-XXXX-X/23/07} + +%% +%% Submission ID. +%% Use this when submitting an article to a sponsored event. You'll +%% receive a unique submission ID from the organizers +%% of the event, and this ID should be used as the parameter to this command. +%%\acmSubmissionID{123-A56-BU3} + +%% +%% For managing citations, it is recommended to use bibliography +%% files in BibTeX format. +%% +%% You can then either use BibTeX with the ACM-Reference-Format style, +%% or BibLaTeX with the acmnumeric or acmauthoryear styles, that include +%% support for advanced citation of software artefact from the +%% biblatex-software package, also separately available on CTAN. +%% +%% Look at the sample-*-biblatex.tex files for templates showcasing +%% the biblatex styles. +%% + +%% +%% The majority of ACM publications use numbered citations and +%% references. The command \citestyle{acmauthoryear} switches to the +%% "author year" style. +%% +%% If you are preparing content for an event +%% sponsored by ACM SIGGRAPH, you must use the "author year" style of +%% citations and references. +%% Uncommenting +%% the next command will enable that style. +%%\citestyle{acmauthoryear} + +%% +%% end of the preamble, start of the body of the document source. +\begin{document} + +%% +%% The "title" command has an optional parameter, +%% allowing the author to define a "short title" to be used in page headers. +\title{Decoupled Data Science - Tools for flexible, reproducible eScience} + +%% +%% The "author" command and its associated commands are used to define +%% the authors and their affiliations. 
+%% Of note is the shared affiliation of the first two authors, and the +%% "authornote" and "authornotemark" commands +%% used to denote shared contribution to the research. +\author{John Hawkins} +\email{john.hawkins@Getting-Data-Science-Done.com} +\orcid{1234-5678-9012} +\affiliation{% + \institution{Getting-Data-Science-Done.com} + \city{Sydney} + \state{NSW} + \country{Australia} + \postcode{2000} +} + +\renewcommand{\shortauthors}{Hawkins} + +\begin{abstract} +\end{abstract} + +%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm. +%% Please copy and paste the code instead of the example below. + +\begin{CCSXML} +<ccs2012> +<concept> +<concept_id>10002951.10003260.10003261</concept_id> +<concept_desc>Information systems~Web searching and information discovery</concept_desc> +<concept_significance>500</concept_significance> +</concept> +<concept> +<concept_id>10002951.10003260.10003277.10003280</concept_id> +<concept_desc>Information systems~Web log analysis</concept_desc> +<concept_significance>500</concept_significance> +</concept> +<concept> +<concept_id>10010147.10010257</concept_id> +<concept_desc>Computing methodologies~Machine learning</concept_desc> +<concept_significance>500</concept_significance> +</concept> +</ccs2012> +\end{CCSXML} + +\ccsdesc[500]{Information systems~Web searching and information discovery} +\ccsdesc[500]{Information systems~Web log analysis} +\ccsdesc[500]{Computing methodologies~Machine learning} + +\keywords{Data Science, Experiment Tracking, Reproducible Science, Metadata Tracking} + +\maketitle + +\section{Introduction} + +Progress in scientific research depends heavily on accurate record keeping of previous +experimental approaches and results. Failure to maintain these records impedes progress +by making it difficult to reproduce work, or imposing the costs of repeatedly testing +failed lines of experimentation. This cost becomes particularly high in the age of the +reproducibility crisis, as many teams running the same experiments in parallel, without +knowledge of each other's work, have produced a factory line of irreproducible results. + +Many Machine Learning experimentation frameworks focus on the task of making experiments +easier to execute and deploy into production systems~\cite{Alberti:2018,MolnerDomenech:2020}. 
+To do so, they are often constructed for specific combinations of technology. This is a necessary limitation +to enable efficiency, but has the effect of limiting general applicability. +Many other eScience frameworks focus on the lineage and management of data, referred to as +the provenance problem~\cite{Sahoo:2008,Conquest:2021}. +The goal of the provenance frameworks is sufficient auditability of data sources to +render eScience transparent and repeatable. + +Other frameworks and approaches focus on understanding how to do large-scale collaborative science, or +facilitate meta-level learning of various kinds~\cite{Hunter:2005,Liu:2023}. The better we track the +process of science as a whole, the better we can understand both how to improve scientific processes +and how to data mine the history of science for phenomena that were difficult to detect. + +\section{Methodology} + +We begin by discussing all desirable elements required of an open science framework. These are drawn +from observations of both how collaborative science works and the successful components of distributed +scientific endeavours. These requirements are drawn from both sciences that are typically dependent +on computational frameworks (computer science, bioinformatics, physics) and those that generally are not +(social science, psychology). 
+ +The key elements are as follows: + +\begin{itemize} + \item Provenance of Data Sources + \item Record of Data Processing + \item Reuse of Datasets + \item Tracking of Experiments and Results + \item Comparison of Methods and Results + \item Generation of Documentation + \item Reproducibility of Projects + \item Facilitation of Meta-Analysis +\end{itemize} + +\subsection{Data} + +\subsection{Features} + +\subsection{Feature Importance} + + +\section{Results} + + + +\section{Conclusion} + + +\section{Acknowledgments} + +\bibliographystyle{ACM-Reference-Format} +\bibliography{refs} + +\end{document} +\endinput diff --git a/docs/paper/refs.bib b/docs/paper/refs.bib new file mode 100644 index 0000000..4952794 --- /dev/null +++ b/docs/paper/refs.bib @@ -0,0 +1,74 @@ +@article{MolnerDomenech:2020, + doi = {10.1088/1742-6596/1603/1/012025}, + url = {https://dx.doi.org/10.1088/1742-6596/1603/1/012025}, + year = {2020}, + month = {sep}, + publisher = {IOP Publishing}, + volume = {1603}, + number = {1}, + pages = {012025}, + author = {Antonio Molner Domenech and Alberto Guillén}, + title = {ml-experiment: A Python framework for reproducible data science}, + journal = {Journal of Physics: Conference Series}, +} + +@INPROCEEDINGS{Conquest:2021, + author={Conquest, Joseph and Stiber, Michael}, + booktitle={2021 IEEE 17th International Conference on eScience (eScience)}, + title={Software and Data Provenance as a Basis for eScience Workflow}, + year={2021}, + volume={}, + number={}, + pages={245-246}, + doi={10.1109/eScience51609.2021.00043} +} + +@article{Alberti:2018, + author = {Alberti, Michele and Pondenkandath, Vinaychandran and Gygli, Marcel and Ingold, Rolf and Liwicki, Marcus}, + year = {2018}, + month = {04}, + pages = {}, + title = {DeepDIVA: A Highly-Functional Python Framework for Reproducible Experiments} +} + +@article{Miksa:2017, + author = {Miksa, Tomasz and Rauber, Andreas}, + year = {2017}, + month = {01}, + pages = {}, + title = {Using ontologies for 
verification and validation of workflow-based experiments}, + volume = {43}, + journal = {Web Semantics: Science, Services and Agents on the World Wide Web}, + doi = {10.1016/j.websem.2017.01.002} +} + +@article{Sahoo:2008, + author = {Sahoo, Satya and Sheth, Amit and Henson, Cory}, + year = {2008}, + month = {08}, + pages = {46--54}, + title = {Semantic Provenance for eScience: Managing the Deluge of Scientific Data}, + volume = {12}, + journal = {IEEE Internet Computing}, + doi = {10.1109/MIC.2008.86} +} + +@article{Hunter:2005, + author = {Hunter, Jane and Cheung, Kwok}, + year = {2005}, + month = {01}, + pages = {}, + title = {Generating eScience Workflows from Statistical Analysis of Prior Data} +} + +@article{Liu:2023, + author = {Liu, Lu and Jones, Benjamin F. and Uzzi, Brian and Wang, Dashun}, + year = {2023}, + month = {06}, + issn = {2397-3374}, + title = {Data, measurement and empirical methods in the science of science}, + journal = {Nature Human Behaviour}, + doi = {10.1038/s41562-023-01562-4} +} + +