Skip to content

Commit

Permalink
Paper updates
Browse files Browse the repository at this point in the history
  • Loading branch information
john-hawkins committed Jun 19, 2023
1 parent 1a46ed0 commit b94c363
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 40 deletions.
64 changes: 64 additions & 0 deletions docs/paper/Projit_decoupled_process.drawio
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<mxfile host="app.diagrams.net" modified="2023-06-19T05:01:32.105Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" etag="UbwSm0jIV6gPaTxYB8mX" version="21.5.0" type="device">
<diagram name="Page-1" id="1oxovvhrkwaGEaMZ6Zkm">
<mxGraphModel dx="891" dy="623" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="XCQ5zsaXB3w69v--F8P7-1" value="External Sources" style="shape=datastore;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="11" y="20" width="120" height="60" as="geometry" />
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-6" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.15;entryY=0.95;entryDx=0;entryDy=0;entryPerimeter=0;strokeColor=#7EA6E0;strokeWidth=5;exitX=1;exitY=0.25;exitDx=0;exitDy=0;" edge="1" parent="1" source="XCQ5zsaXB3w69v--F8P7-2" target="XCQ5zsaXB3w69v--F8P7-4">
<mxGeometry relative="1" as="geometry">
<mxPoint x="331" y="133" as="sourcePoint" />
<mxPoint x="429" y="50" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-2" value="Data Preparation" style="swimlane;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="21" y="120" width="300" height="80" as="geometry" />
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-10" value="" style="shape=flexArrow;endArrow=classic;html=1;rounded=0;fillColor=#9AC7BF;fillStyle=solid;" edge="1" parent="XCQ5zsaXB3w69v--F8P7-2">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="59.5" y="-40" as="sourcePoint" />
<mxPoint x="60" y="30" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-11" value="" style="shape=flexArrow;endArrow=classic;html=1;rounded=0;fillColor=#9AC7BF;fillStyle=solid;" edge="1" parent="XCQ5zsaXB3w69v--F8P7-2">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="260" y="30" as="sourcePoint" />
<mxPoint x="259.5" y="-40" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-15" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeColor=#B5739D;strokeWidth=5;dashed=1;" edge="1" parent="1" source="XCQ5zsaXB3w69v--F8P7-4" target="XCQ5zsaXB3w69v--F8P7-14">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-4" value="Projit Metadata" style="shape=datastore;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="360" y="20" width="120" height="60" as="geometry" />
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-5" value="Processed Data" style="shape=datastore;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="211" y="20" width="120" height="60" as="geometry" />
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-7" value="Experimentation" style="swimlane;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="21" y="240" width="300" height="80" as="geometry" />
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-8" value="Analysis of Results" style="swimlane;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="21" y="360" width="300" height="80" as="geometry" />
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-12" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=1;entryDx=0;entryDy=0;strokeColor=#7EA6E0;strokeWidth=5;startArrow=classic;startFill=1;exitX=1;exitY=0.25;exitDx=0;exitDy=0;" edge="1" parent="1" source="XCQ5zsaXB3w69v--F8P7-7" target="XCQ5zsaXB3w69v--F8P7-4">
<mxGeometry relative="1" as="geometry">
<mxPoint x="331" y="256" as="sourcePoint" />
<mxPoint x="429" y="200" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.85;entryY=0.95;entryDx=0;entryDy=0;strokeColor=#7EA6E0;strokeWidth=5;entryPerimeter=0;startArrow=classic;startFill=0;exitX=1;exitY=0.25;exitDx=0;exitDy=0;" edge="1" parent="1" source="XCQ5zsaXB3w69v--F8P7-8" target="XCQ5zsaXB3w69v--F8P7-4">
<mxGeometry relative="1" as="geometry">
<mxPoint x="331" y="376" as="sourcePoint" />
<mxPoint x="471" y="200" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="XCQ5zsaXB3w69v--F8P7-14" value="Meta-Analysis" style="swimlane;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="520" y="300" width="280" height="80" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
Binary file added docs/paper/Projit_decoupled_process.drawio.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
89 changes: 49 additions & 40 deletions docs/paper/paper.tex
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{Decoupled Data Science - Tools for flexible, reproducible eScience}
\title{Projit: An Open Source Tool for Decoupled Data Science}

%%
%% The "author" command and its associated commands are used to define
Expand All @@ -118,35 +118,17 @@

\renewcommand{\shortauthors}{Hawkins}

\begin{abstract}
\begin{abstract}
Scientific practice has expanded to become increasingly dependent on digital technologies,
large-scale data processing and advanced analytical methods. These shifts have demanded new
methods of implementing and recording the details of scientific projects. Monolithic applications
have the advantage of a single and consistent design; however, they impede the ability of users to
innovate and incrementally improve processes. We discuss the qualities of an ideal eScience framework
for building multi-stage collaborative scientific workflows and present an open source implementation
for managing scientific processes in a decoupled fashion that permits both flexible implementation of
any stage of processing, and greater ease of meta-data analysis.
\end{abstract}

%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
%% Please copy and paste the code instead of the example below.

\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10002951.10003260.10003261</concept_id>
<concept_desc>Information systems~Web searching and information discovery</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10002951.10003260.10003277.10003280</concept_id>
<concept_desc>Information systems~Web log analysis</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010147.10010257</concept_id>
<concept_desc>Computing methodologies~Machine learning</concept_desc>
<concept_significance>500</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}

\ccsdesc[500]{Information systems~Web searching and information discovery}
\ccsdesc[500]{Information systems~Web log analysis}
\ccsdesc[500]{Computing methodologies~Machine learning}

\keywords{Data Science, Experiment Tracking, Reproducible Science, Metadata Tracking}

Expand Down Expand Up @@ -178,29 +160,56 @@ \section{Introduction}
\section{Methodology}

We begin by discussing all desirable elements required of an open science framework. These are drawn
from observations of both how collabroative science works and the successful components of distributed
from observations of both how collaborative science works and the successful components of distributed
scientific endeavours. These requirements are drawn from both sciences that are typically dependent
on computational frameworks (computer science, bioinformatics, physics) and those that generally are not
(social science, psychology).

The key elements are as follows:

\begin{itemize}
\item Provenance of Data Sources
\item Record of Data Processing
\item Reuse of Datasets
\item Tracking of Experiments and Results
\item Comparison of Methods and Results
\item Generation of Documentation
\item Reproducibility of Projects
\item Facilitation of Meta-Analysis
\item Sources: Provenance of Data Sources
\item Processing: Record of Data Processing
\item Reuse: Facilitating Reuse of Datasets
\item Tracking: Tracking of Experiments and Outputs
\item Results: Comparison of Methods and Results
\item Documentation: Generation of Documentation
\item Reproduction: Reproducibility of Projects
\item Meta-Analysis: Facilitation of Meta-Analysis
\end{itemize}

\subsection{Data}
The elements in this list are organised in an approximately sequential manner. However, as we discuss
them below it should be apparent that there are many ways in which these elements support each other.
First and foremost, data-driven projects
require a method of accessing the required \textbf{source} data and will need to maintain records
of this data provenance. There will typically be \textbf{processing} applied to these datasets to
render them applicable to experimentation and analysis. An ideal tool will track the sequential
nature of this processing as well as store information about the location of each resulting dataset.
The data processed in this way is then available for \textbf{reuse} across experiments and analysis,
making \textbf{results} comparable and facilitating \textbf{meta-analysis}.

\subsection{Features}

\subsection{Feature Importance}

\subsection{Projit Process}

The central design principle of \textit{projit} is that the decoupling of data science
should be achieved through utilisation of a simple metadata store. Each aspect of
data science work can be allowed to proceed without awareness of the structure of
any other element as long as it can access the information it requires through this
metadata store.

\begin{figure*}
\includegraphics[scale=0.6]{./Projit_decoupled_process.drawio.png}
\caption{Projit Process for Decoupled Data Science}
\label{fig:projit}
\end{figure*}

In Figure~\ref{fig:projit} we see that the core steps of data preparation, experimentation
and analysis of results all happen independently. Each of them accesses the projit store for
the information it needs, and stores the information it produces there for the other steps to use.

\subsection{Application}



\section{Results}
Expand Down
16 changes: 16 additions & 0 deletions docs/source/projit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ projit package
Submodules
----------

projit.ascii\_plot module
-------------------------

.. automodule:: projit.ascii_plot
:members:
:undoc-members:
:show-inheritance:

projit.cli module
-----------------

Expand All @@ -20,6 +28,14 @@ projit.config module
:undoc-members:
:show-inheritance:

projit.pdf module
-----------------

.. automodule:: projit.pdf
:members:
:undoc-members:
:show-inheritance:

projit.projit module
--------------------

Expand Down

0 comments on commit b94c363

Please sign in to comment.