Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow custom geometry single-cell protocols #734

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ endif()
##
set(Boost_ADDITIONAL_VERSIONS "1.59.0" "1.60.0" "1.61.0" "1.62.0" "1.63.0" "1.64.0" "1.65.0" "1.66.0" "1.67.0" "1.68.0" "1.69.0" "1.70.0" "1.71.0" "1.72.0" "1.73.0" "1.74.0" "1.75.0" "1.76.0" "1.77.0" "1.78.0")
if (NOT BOOST_RECONFIGURE)
find_package(Boost 1.59.0 COMPONENTS iostreams system filesystem timer chrono program_options)
find_package(Boost 1.59.0 COMPONENTS iostreams filesystem system timer chrono program_options regex)
message("BOOST_INCLUDEDIR = ${BOOST_INCLUDEDIR}")
message("BOOST_LIBRARYDIR = ${BOOST_LIBRARYDIR}")
message("Boost_FOUND = ${Boost_FOUND}")
Expand Down Expand Up @@ -459,7 +459,7 @@ if(BOOST_RECONFIGURE)
set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
set(Boost_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
set(Boost_LIBRARY_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib)
find_package(Boost 1.59.0 COMPONENTS iostreams system filesystem timer chrono program_options locale REQUIRED)
find_package(Boost 1.59.0 COMPONENTS iostreams filesystem system timer chrono program_options locale regex REQUIRED)
set(FETCH_BOOST FALSE)
endif()

Expand All @@ -484,7 +484,7 @@ elseif(FETCH_BOOST)
## Let the rest of the build process know we're going to be fetching boost
set(BOOST_LIB_SUBSET --with-iostreams --with-atomic --with-chrono --with-container --with-date_time --with-exception
--with-filesystem --with-graph --with-graph_parallel --with-math
--with-program_options --with-system --with-locale
--with-program_options --with-system --with-locale --with-regex
--with-timer)
set(BOOST_WILL_RECONFIGURE TRUE)
set(FETCH_BOOST FALSE)
Expand Down
9 changes: 8 additions & 1 deletion include/AlevinUtils.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ namespace alevin{
void readWhitelist(bfs::path& filePath,
TrueBcsT& trueBarcodes);

unsigned int hammingDistance(const std::string s1, const std::string s2);
uint32_t hammingDistance(const std::string& s1, const std::string& s2);

void addPadding(std::string& seq, uint32_t max, const char padBases[], uint32_t padLen);

template <typename ProtocolT>
bool processAlevinOpts(AlevinOpts<ProtocolT>& aopt,
Expand Down Expand Up @@ -128,6 +130,11 @@ namespace alevin{
bool hasOneGene(const std::vector<uint32_t>& txps, uint32_t& geneId,
spp::sparse_hash_map<uint32_t, uint32_t>& txpToGeneMap,
const size_t numGenes);

void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat);
void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat, uint32_t& bioPat);
void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector<uint32_t, 4, 5> *bu, uint32_t& nPat, std::size_t len);
void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector<uint32_t, 4, 5> *bu, uint32_t& nPat, std::size_t first, std::size_t second);
}
}
#endif // __ALEVIN_UTILS_HPP__
56 changes: 56 additions & 0 deletions include/SingleCellProtocols.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@

#include <string>
#include <ostream>
#include <boost/regex.hpp>

#include "AlevinOpts.hpp"
#include "AlevinTypes.hpp"
#include "pufferfish/itlib/static_vector.hpp"
#include "pufferfish/itlib/small_vector.hpp"


namespace alevin{
namespace protocols {
Expand Down Expand Up @@ -220,7 +223,60 @@ namespace alevin{
BarcodeEnd end;
};

// Custom geometry specification using regex for extraction
struct CustomGeo {
// store regex string for reads 1 and 2
static std::string reg[2];
// store group numbers for bc and umi in regex string groups
static itlib::small_vector<uint32_t, 4, 5> b[2], u[2];
// bioRead stores the read number for biological read and bioPat stores match pattern number on regex
static uint32_t bioRead, bioPat; // biological read would be contigous and on only 1 of the read
static uint32_t minBcLen, maxBcLen, minUmiLen, maxUmiLen;
static bool bioReadFound;
static constexpr const char paddingBases[4] = {'A', 'C', 'G', 'T'};
static constexpr uint32_t padLen = 4;
// store the matches for both reads
boost::smatch match[2];
// store the success of regex search
bool rgx_search[2];
// store the regex
static boost::regex rgx[2];
// string for extract barcode and umi
std::string barcode, um;
// required
static uint32_t barcodeLength, umiLength;
BarcodeEnd end;

TagGeometry umi_geo;
TagGeometry bc_geo;
TagGeometry read_geo;

void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length(); }
void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length(); }
void set_read_geo(TagGeometry& g) { read_geo = g; }
uint32_t barcode_length() const { return barcodeLength; }
uint32_t umi_length() const { return umiLength; }
};

}

// CustomGeo type of read part
enum class customReadpartType : char {
bc = 'b',
umi = 'u',
fixed = 'f',
exclude = 'x'
};

// Store the readNumber and type info for CustomGeo
struct ProtoInfo
{
size_t readNumber;
customReadpartType type;
};

}

#endif


8 changes: 8 additions & 0 deletions src/Alevin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1039,6 +1039,7 @@ salmon-based processing of single-cell RNA-seq data.
bool custom_new = (vm.count("bc-geometry") and
vm.count("umi-geometry"));
bool custom = custom_old or custom_new;
bool custom_geo = vm.count("custom-geo");

uint8_t validate_num_protocols {0};
if (dropseq) validate_num_protocols += 1;
Expand All @@ -1054,6 +1055,7 @@ salmon-based processing of single-cell RNA-seq data.
if (quartzseq2) validate_num_protocols += 1;
if (sciseq3) validate_num_protocols += 1;
if (custom) validate_num_protocols += 1;
if (custom_geo) validate_num_protocols += 1;

if ( validate_num_protocols != 1 ) {
fmt::print(stderr, "ERROR: Please specify one and only one scRNA protocol;");
Expand Down Expand Up @@ -1189,6 +1191,12 @@ salmon-based processing of single-cell RNA-seq data.
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles, salmonIndex);
} else if (custom_geo) {
AlevinOpts<apt::CustomGeo> aopt;
//aopt.jointLog->warn("Using Custom Setting for Alevin");
initiatePipeline(aopt, sopt, orderedOptions,
vm, commentString, noTgMap,
barcodeFiles, readFiles, salmonIndex);
}

} catch (po::error& e) {
Expand Down
4 changes: 4 additions & 0 deletions src/AlevinHash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,3 +331,7 @@ int salmonHashQuantify(AlevinOpts<apt::CustomGeometry>& aopt,
bfs::path& outputDirectory,
CFreqMapT& freqCounter);

template
int salmonHashQuantify(AlevinOpts<apt::CustomGeo>& aopt,
bfs::path& outputDirectory,
CFreqMapT& freqCounter);
Loading