From b11dc42f2a4a87de21c9416de40be01223c5438b Mon Sep 17 00:00:00 2001 From: Gaurav Date: Wed, 1 Dec 2021 11:26:11 -0500 Subject: [PATCH 01/17] initial custom geometry modifications - successful make --- include/AlevinUtils.hpp | 6 + include/SingleCellProtocols.hpp | 34 ++++++ src/Alevin.cpp | 8 ++ src/AlevinHash.cpp | 4 + src/AlevinUtils.cpp | 189 +++++++++++++++++++++++++++++++- src/CollapsedCellOptimizer.cpp | 10 ++ src/GZipWriter.cpp | 8 ++ src/SalmonAlevin.cpp | 14 +++ src/WhiteList.cpp | 4 + 9 files changed, 276 insertions(+), 1 deletion(-) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp index f191d594c..c450dd4c9 100644 --- a/include/AlevinUtils.hpp +++ b/include/AlevinUtils.hpp @@ -128,6 +128,12 @@ namespace alevin{ bool hasOneGene(const std::vector& txps, uint32_t& geneId, spp::sparse_hash_map& txpToGeneMap, const size_t numGenes); + + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat); + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat, unsigned int& bioPat); + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat, unsigned int& bioPat, std::size_t len); + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat, std::size_t len); + void modifyRegex(size_t readNumber, std::string desc, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat); } } #endif // __ALEVIN_UTILS_HPP__ diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 1db3592dc..5bc535039 100644 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -3,6 +3,7 @@ #include #include +#include #include "AlevinOpts.hpp" #include "AlevinTypes.hpp" @@ -220,7 +221,40 @@ namespace alevin{ BarcodeEnd end; }; + // Custome geometry specification using regex for extraction + struct CustomGeo { + // store regex for reads 1 and 2 + std::string reg[2] = {"", ""}; + // store positions of matches for bc and umi + std::vector b[2], u[2]; + // bioRead stores the read number for biological read and bioPat stores match pattern number on regex + unsigned int nPatterns = 1, bioRead, bioPat; // biological read would be contigous and on only 1 of the read + bool bioReadFound = false; + // store the matches for both reads + boost::smatch match[2]; + // required + uint32_t barcodeLength, umiLength, maxValue; + BarcodeEnd end; + + TagGeometry umi_geo; + TagGeometry bc_geo; + TagGeometry read_geo; + + void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length(); } + void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length(); } + void set_read_geo(TagGeometry& g) { read_geo = g; } + uint32_t barcode_length() const { return barcodeLength; } + uint32_t umi_length() const { return umiLength; } + }; + } + struct ProtoInfo + { + size_t readNumber; + std::string type; + std::string desc; + }; + } #endif diff --git a/src/Alevin.cpp b/src/Alevin.cpp index 8246b7263..a4724994a 100644 --- a/src/Alevin.cpp +++ b/src/Alevin.cpp @@ -1039,6 +1039,7 @@ salmon-based processing of single-cell RNA-seq data. bool custom_new = (vm.count("bc-geometry") and vm.count("umi-geometry")); bool custom = custom_old or custom_new; + bool custom_geo = vm.count("custom-geo"); uint8_t validate_num_protocols {0}; if (dropseq) validate_num_protocols += 1; @@ -1054,6 +1055,7 @@ salmon-based processing of single-cell RNA-seq data. if (quartzseq2) validate_num_protocols += 1; if (sciseq3) validate_num_protocols += 1; if (custom) validate_num_protocols += 1; + if (custom_geo) validate_num_protocols += 1; if ( validate_num_protocols != 1 ) { fmt::print(stderr, "ERROR: Please specify one and only one scRNA protocol;"); @@ -1189,6 +1191,12 @@ salmon-based processing of single-cell RNA-seq data. initiatePipeline(aopt, sopt, orderedOptions, vm, commentString, noTgMap, barcodeFiles, readFiles, salmonIndex); + } else if (custom_geo) { + AlevinOpts aopt; + //aopt.jointLog->warn("Using Custom Setting for Alevin"); + initiatePipeline(aopt, sopt, orderedOptions, + vm, commentString, noTgMap, + barcodeFiles, readFiles, salmonIndex); } } catch (po::error& e) { diff --git a/src/AlevinHash.cpp b/src/AlevinHash.cpp index 8bddb0636..c7f4cc676 100644 --- a/src/AlevinHash.cpp +++ b/src/AlevinHash.cpp @@ -331,3 +331,7 @@ int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); +template +int salmonHashQuantify(AlevinOpts& aopt, + bfs::path& outputDirectory, + CFreqMapT& freqCounter); diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index a5bad4a9f..4b4727fa3 100644 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -142,6 +142,15 @@ namespace alevin { //subseq = seq2; } template <> + std::string* getReadSequence(apt::CustomGeo& protocol, + std::string& seq, + std::string& seq2, + std::string& subseq){ + bool ok = protocol.read_geo.extract_read(seq, seq2, subseq); + return &subseq; + //subseq = seq2; + } + template <> std::string* getReadSequence(apt::Gemcode& protocol, std::string& seq, std::string& seq2, @@ -246,6 +255,14 @@ namespace alevin { return pt.umi_geo.extract_tag(read1, read2, umi); } template <> + bool extractUMI(std::string& read1, + std::string& read2, + apt::CustomGeo& pt, + std::string& umi){ + + return false; + } + template <> bool extractUMI(std::string& read, std::string& read2, apt::QuartzSeq2& pt, @@ -407,6 +424,13 @@ namespace alevin { return pt.bc_geo.extract_tag(read1, read2, bc); } template <> + bool extractBarcode(std::string& read1, + std::string& read2, + apt::CustomGeo& pt, + std::string& bc){ + return pt.bc_geo.extract_tag(read1, read2, bc); + } + template <> bool extractBarcode(std::string& read, std::string& read2, apt::QuartzSeq2& pt, @@ -570,6 +594,51 @@ namespace alevin { std::not2(std::equal_to())); } + void modifyRegex(size_t readNumber, std::string desc, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat){ + if (desc == "b"){ + reg[readNumber-1] += "([ATGC]{1,})"; + b[readNumber-1].push_back(nPat); + nPat++; + } else if (desc == "u"){ + reg[readNumber-1] += "([ATGC]{1,})"; + u[readNumber-1].push_back(nPat); + nPat++; + } +} + + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat, std::size_t len) + { + if (type == "b"){ + reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; + b[readNumber-1].push_back(nPat); + nPat++; + } else if (type == "u"){ + reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; + u[readNumber-1].push_back(nPat); + nPat++; + } + } + + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat) + { + reg[readNumber -1] += "(" + seq + ")"; + nPat++; + } + + void modifyRegex(size_t readNumber, std::string* reg, unsigned int& nPat, unsigned int& bioPat) + { + reg[readNumber - 1] += "([ATGC]{1,})"; + bioPat = nPat; + nPat++; + } + + void modifyRegex(size_t readNumber, std::string* reg, unsigned int& nPat, unsigned int& bioPat, std::size_t len) + { + reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; + bioPat = nPat; + nPat++; + } + void getTxpToGeneMap(spp::sparse_hash_map& txpToGeneMap, spp::sparse_hash_map& geneIdxMap, const std::string& t2gFileName, @@ -1008,6 +1077,7 @@ namespace alevin { bool have_custom_umi_geo = vm.count("umi-geometry"); bool have_custom_bc_geo = vm.count("bc-geometry"); bool have_custom_read_geo = vm.count("read-geometry"); + bool have_custom_geo = vm.count("custome-geo"); // need both bool have_any_custom_geo = have_custom_read_geo or have_custom_umi_geo or have_custom_bc_geo; bool have_all_custom_geo = have_custom_read_geo and have_custom_umi_geo and have_custom_bc_geo; @@ -1221,7 +1291,120 @@ namespace alevin { // if it's OK, set the umi kmer length alevin::types::AlevinUMIKmer::k( static_cast(umi_geo.length()) ); - } // new custom barcode geometry + } else if (have_custom_geo) { // regex based unified barcode geometry parsing + + struct apt::CustomGeo customGeo; + struct ProtoInfo proto; + peg::parser parser(R"( + Specification <- ReadNumber1'{'Description{1,10}'}'ReadNumber2'{'Description{1,10}'}' + ReadNumber1 <- [1] + ReadNumber2 <- [2] + Description <- Type'['Position']' / Fixed'['Sequence']' / Type + Type <- 'b'i / 'u'i / 'r'i + Fixed <- 'f'i + Sequence <- [ATGC]+ + Position <- (Number '-' Number) / (Number '-' End) / Length + Number <- [0-9]+ + Length <- Number + End <- 'end' + )"); + + + parser["Number"] = [](const peg::SemanticValues &sv) + { + auto val = static_cast(std::stoull(sv.token())); + return val; + }; + + + parser["ReadNumber1"] = [&](const peg::SemanticValues &sv) + { + auto val = std::stoi(sv.token()); + proto.readNumber = val; + }; + + parser["ReadNumber2"] = [&](const peg::SemanticValues &sv) + { + auto val = std::stoi(sv.token()); + proto.readNumber = val; + if(val == 2){ customGeo.nPatterns = 1;} + }; + + parser["Description"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token()); + if (val == "b" || val == "u"){ + modifyRegex(proto.readNumber, val, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns); + } else if (val == "r") { + modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat); + } + }; + + parser["Type"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token()); + proto.type = val; + if(val == "r"){ + if (!customGeo.bioReadFound){ + customGeo.bioRead = proto.readNumber; + customGeo.bioReadFound = true; + } else { + std::cout << "Error message: only contigous biological read expected" << std::endl; + } + } + }; + + parser["Fixed"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token()); + proto.type = val; + }; + + parser["Sequence"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token()); + modifyRegex(proto.readNumber, val, customGeo.reg, customGeo.nPatterns); + }; + + parser["Position"] = [&](const peg::SemanticValues &sv) + { + switch (sv.choice()) + { + case 0: + { + auto val = std::make_pair( + peg::any_cast(sv[0]), peg::any_cast(sv[1])); + if (proto.type == "b" || proto.type == "u") { + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns, val.second - val.first + 1); + } else { + modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat, val.second - val.first + 1); + } + } + break; + case 1: + { + if (proto.type == "b" || proto.type == "u"){ + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns); + } else { // when it's r + modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat); + } + } + break; + case 2: + { + auto val = peg::any_cast(sv[0]); + if (proto.type == "b" || proto.type == "u"){ + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns, val); + } else { + modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat, val); + } + } + break; + } + }; + + return 0; + } //validate specified iupac if (aopt.iupac.size()>0){ @@ -1431,6 +1614,10 @@ namespace alevin { SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template + bool processAlevinOpts(AlevinOpts& aopt, + SalmonOpts& sopt, bool noTgMap, + boost::program_options::variables_map& vm); + template bool processAlevinOpts(AlevinOpts& aopt, SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); diff --git a/src/CollapsedCellOptimizer.cpp b/src/CollapsedCellOptimizer.cpp index 5c868864e..bb0b0c90b 100644 --- a/src/CollapsedCellOptimizer.cpp +++ b/src/CollapsedCellOptimizer.cpp @@ -1537,6 +1537,16 @@ bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, size_t numLowConfidentBarcode); template +bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, + spp::sparse_hash_map& txpToGeneMap, + spp::sparse_hash_map& geneIdxMap, + AlevinOpts& aopt, + GZipWriter& gzw, + std::vector& trueBarcodes, + std::vector& umiCount, + CFreqMapT& freqCounter, + size_t numLowConfidentBarcode); +template bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, spp::sparse_hash_map& txpToGeneMap, spp::sparse_hash_map& geneIdxMap, diff --git a/src/GZipWriter.cpp b/src/GZipWriter.cpp index 0fe3e4ff2..c05bf0fa9 100644 --- a/src/GZipWriter.cpp +++ b/src/GZipWriter.cpp @@ -1912,6 +1912,10 @@ template bool GZipWriter::writeEquivCounts( const AlevinOpts& aopts, SCExpT& readExp); +template +bool GZipWriter::writeEquivCounts( + const AlevinOpts& aopts, + SCExpT& readExp); template bool GZipWriter::writeMetaAlevin(const AlevinOpts& opts, boost::filesystem::path aux_dir); @@ -1956,4 +1960,8 @@ template bool GZipWriter::writeMetaAlevin(const AlevinOpts& opts, boost::filesystem::path aux_dir); +template bool +GZipWriter::writeMetaAlevin(const AlevinOpts& opts, + boost::filesystem::path aux_dir); + diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index e5f45d988..ab03ab2a3 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -3221,3 +3221,17 @@ alevinQuant(AlevinOpts& aopt, SalmonOpts& sopt, boost::program_options::parsed_options& orderedOptions, CFreqMapT& freqCounter, size_t numLowConfidentBarcode, std::unique_ptr& salmonIndex); + +template int +alevin_sc_align(AlevinOpts& aopt, SalmonOpts& sopt, + boost::program_options::parsed_options& orderedOptions, + std::unique_ptr& salmonIndex); + +template int +alevinQuant(AlevinOpts& aopt, SalmonOpts& sopt, + SoftMapT& barcodeMap, TrueBcsT& trueBarcodes, + spp::sparse_hash_map& txpToGeneMap, + spp::sparse_hash_map& geneIdxMap, + boost::program_options::parsed_options& orderedOptions, + CFreqMapT& freqCounter, size_t numLowConfidentBarcode, + std::unique_ptr& salmonIndex); \ No newline at end of file diff --git a/src/WhiteList.cpp b/src/WhiteList.cpp index fd6559811..57ddc19c7 100644 --- a/src/WhiteList.cpp +++ b/src/WhiteList.cpp @@ -308,5 +308,9 @@ namespace alevin { std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); + template bool performWhitelisting(AlevinOpts& aopt, + std::vector& trueBarcodes, + bool useRibo, bool useMito, + size_t numLowConfidentBarcode); } } From 12dda2c59d88b61cc02ed3a9e7be8ecb9e725725 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Tue, 21 Dec 2021 15:22:02 -0500 Subject: [PATCH 02/17] add custom barcode extraction --- CMakeLists.txt | 6 ++--- include/AlevinUtils.hpp | 0 include/SingleCellProtocols.hpp | 8 +++--- src/AlevinUtils.cpp | 46 ++++++++++++++++++++++++++++++--- src/CMakeLists.txt | 3 ++- src/ProgramOptionsGenerator.cpp | 3 +++ 6 files changed, 55 insertions(+), 11 deletions(-) mode change 100644 => 100755 CMakeLists.txt mode change 100644 => 100755 include/AlevinUtils.hpp mode change 100644 => 100755 include/SingleCellProtocols.hpp mode change 100644 => 100755 src/AlevinUtils.cpp mode change 100644 => 100755 src/CMakeLists.txt mode change 100644 => 100755 src/ProgramOptionsGenerator.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100644 new mode 100755 index cf2061e7a..b57fc3f7c --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -430,7 +430,7 @@ endif() ## set(Boost_ADDITIONAL_VERSIONS "1.59.0" "1.60.0" "1.61.0" "1.62.0" "1.63.0" "1.64.0" "1.65.0" "1.66.0" "1.67.0" "1.68.0" "1.69.0" "1.70.0" "1.71.0" "1.72.0" "1.73.0" "1.74.0" "1.75.0" "1.76.0" "1.77.0" "1.78.0") if (NOT BOOST_RECONFIGURE) -find_package(Boost 1.59.0 COMPONENTS iostreams system filesystem timer chrono program_options) +find_package(Boost 1.59.0 COMPONENTS iostreams filesystem system timer chrono program_options regex) message("BOOST_INCLUDEDIR = ${BOOST_INCLUDEDIR}") message("BOOST_LIBRARYDIR = ${BOOST_LIBRARYDIR}") message("Boost_FOUND = ${Boost_FOUND}") @@ -459,7 +459,7 @@ if(BOOST_RECONFIGURE) set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/external/install) set(Boost_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include) set(Boost_LIBRARY_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib) - find_package(Boost 1.59.0 COMPONENTS iostreams system filesystem timer chrono program_options locale REQUIRED) + find_package(Boost 1.59.0 COMPONENTS iostreams filesystem system timer chrono program_options locale regex REQUIRED) set(FETCH_BOOST FALSE) endif() @@ -484,7 +484,7 @@ elseif(FETCH_BOOST) ## Let the rest of the build process know we're going to be fetching boost set(BOOST_LIB_SUBSET --with-iostreams --with-atomic --with-chrono --with-container --with-date_time --with-exception --with-filesystem --with-graph --with-graph_parallel --with-math - --with-program_options --with-system --with-locale + --with-program_options --with-system --with-locale --with-regex --with-timer) set(BOOST_WILL_RECONFIGURE TRUE) set(FETCH_BOOST FALSE) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp old mode 100644 new mode 100755 diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp old mode 100644 new mode 100755 index 5bc535039..270f6bfbf --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -224,12 +224,12 @@ namespace alevin{ // Custome geometry specification using regex for extraction struct CustomGeo { // store regex for reads 1 and 2 - std::string reg[2] = {"", ""}; + static std::string reg[2]; // store positions of matches for bc and umi - std::vector b[2], u[2]; + static std::vector b[2], u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex - unsigned int nPatterns = 1, bioRead, bioPat; // biological read would be contigous and on only 1 of the read - bool bioReadFound = false; + static unsigned int nPatterns, bioRead, bioPat; // biological read would be contigous and on only 1 of the read + static bool bioReadFound; // store the matches for both reads boost::smatch match[2]; // required diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp old mode 100644 new mode 100755 index 4b4727fa3..24d2ab86e --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -1,5 +1,15 @@ #include "AlevinUtils.hpp" #include "peglib.h" +#include +// #include + +// store regex for reads 1 and 2 + std::string alevin::protocols::CustomGeo::reg[2]; + // store positions of matches for bc and umi + std::vector alevin::protocols::CustomGeo::b[2], alevin::protocols::CustomGeo::u[2]; + // bioRead stores the read number for biological read and bioPat stores match pattern number on regex + unsigned int alevin::protocols::CustomGeo::nPatterns, alevin::protocols::CustomGeo::bioRead, alevin::protocols::CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read + bool alevin::protocols::CustomGeo::bioReadFound; namespace alevin { namespace utils { @@ -428,7 +438,23 @@ namespace alevin { std::string& read2, apt::CustomGeo& pt, std::string& bc){ - return pt.bc_geo.extract_tag(read1, read2, bc); + std::cout << "extract barcode called"< bool extractBarcode(std::string& read, @@ -1077,7 +1103,7 @@ namespace alevin { bool have_custom_umi_geo = vm.count("umi-geometry"); bool have_custom_bc_geo = vm.count("bc-geometry"); bool have_custom_read_geo = vm.count("read-geometry"); - bool have_custom_geo = vm.count("custome-geo"); + bool have_custom_geo = vm.count("custom-geo"); // need both bool have_any_custom_geo = have_custom_read_geo or have_custom_umi_geo or have_custom_bc_geo; bool have_all_custom_geo = have_custom_read_geo and have_custom_umi_geo and have_custom_bc_geo; @@ -1294,6 +1320,10 @@ namespace alevin { } else if (have_custom_geo) { // regex based unified barcode geometry parsing struct apt::CustomGeo customGeo; + customGeo.reg[0] = ""; + customGeo.reg[1] = ""; + customGeo.nPatterns = 1; + customGeo.bioReadFound = false; struct ProtoInfo proto; peg::parser parser(R"( Specification <- ReadNumber1'{'Description{1,10}'}'ReadNumber2'{'Description{1,10}'}' @@ -1403,7 +1433,17 @@ namespace alevin { } }; - return 0; + // NOTE: this is just for backwards-compatibility. + // The barcode end is redundant with the new geometry + // specification. + aopt.protocol.end = BarcodeEnd::FIVE; + parser.enable_packrat_parsing(); + std::string geometry = vm["custom-geo"].as(); + auto val = parser.parse(geometry.c_str()); + assert(val == true); + std::cout << "reg[0]: " << customGeo.reg[0] << std::endl; + std::cout << "reg[1]: " << customGeo.reg[1] << std::endl; + } //validate specified iupac diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt old mode 100644 new mode 100755 index 6ac84212e..cfe066e2c --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -234,6 +234,7 @@ target_link_libraries(salmon graphdump ntcard gff + alevin_core ${Boost_LIBRARIES} ${ICU_LIBS} ${STADEN_LIBRARIES} ${CURL_LIBRARIES} @@ -244,7 +245,7 @@ target_link_libraries(salmon ${LIBSALMON_LINKER_FLAGS} ${NON_APPLECLANG_LIBS} ksw2pp - alevin_core +## PUFF_INTEGRATION ${ASAN_LIB} ${FAST_MALLOC_LIB} TBB::tbb diff --git a/src/ProgramOptionsGenerator.cpp b/src/ProgramOptionsGenerator.cpp old mode 100644 new mode 100755 index e7f4da381..10150db82 --- a/src/ProgramOptionsGenerator.cpp +++ b/src/ProgramOptionsGenerator.cpp @@ -467,6 +467,9 @@ namespace salmon { ("umi-geometry", po::value(), "format string describing the genometry of the umi" ) + ("custom-geo", po::value(), + "unified custom geometry" + ) ( "end",po::value(), "Cell-Barcodes end (5 or 3) location in the read sequence from where barcode has to" From 25385fad93d0f9ccd716ab9dd62112051d1c15a6 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 23 Dec 2021 14:15:36 -0500 Subject: [PATCH 03/17] return read sequence --- src/AlevinUtils.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 24d2ab86e..1de51fcb9 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -156,9 +156,11 @@ namespace alevin { std::string& seq, std::string& seq2, std::string& subseq){ - bool ok = protocol.read_geo.extract_read(seq, seq2, subseq); - return &subseq; - //subseq = seq2; + if(protocol.bioRead == 1) { + return &seq; + } else { + return &seq2; + } } template <> std::string* getReadSequence(apt::Gemcode& protocol, @@ -269,8 +271,15 @@ namespace alevin { std::string& read2, apt::CustomGeo& pt, std::string& umi){ - - return false; + std::string um = ""; + for(int r=0; r < 2; r++) { + for(int i : pt.u[r]) { + um += pt.match[r][i]; + } + } + umi = um; + std::cout << "UMI: "<< umi < bool extractUMI(std::string& read, @@ -438,8 +447,6 @@ namespace alevin { std::string& read2, apt::CustomGeo& pt, std::string& bc){ - std::cout << "extract barcode called"< Date: Wed, 12 Jan 2022 21:35:08 -0500 Subject: [PATCH 04/17] update modifyRegex and parsing: - introduce exclude 'x' to parsing - change positions with lengths --- src/AlevinUtils.cpp | 154 +++++++++++++++++++++++--------------------- 1 file changed, 80 insertions(+), 74 deletions(-) diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 1de51fcb9..98e009946 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -8,7 +8,9 @@ // store positions of matches for bc and umi std::vector alevin::protocols::CustomGeo::b[2], alevin::protocols::CustomGeo::u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex - unsigned int alevin::protocols::CustomGeo::nPatterns, alevin::protocols::CustomGeo::bioRead, alevin::protocols::CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read + unsigned int alevin::protocols::CustomGeo::bioRead, alevin::protocols::CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read + unsigned int alevin::protocols::CustomGeo::minBcLen, alevin::protocols::CustomGeo::maxBcLen; + unsigned int alevin::protocols::CustomGeo::minUmiLen, alevin::protocols::CustomGeo::maxUmiLen; bool alevin::protocols::CustomGeo::bioReadFound; namespace alevin { @@ -626,49 +628,39 @@ namespace alevin { std::not2(std::equal_to())); } - void modifyRegex(size_t readNumber, std::string desc, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat){ - if (desc == "b"){ - reg[readNumber-1] += "([ATGC]{1,})"; - b[readNumber-1].push_back(nPat); - nPat++; - } else if (desc == "u"){ - reg[readNumber-1] += "([ATGC]{1,})"; - u[readNumber-1].push_back(nPat); - nPat++; + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t first, std::size_t second) + { + reg[readNumber-1] += "([ATGC]{" + std::to_string(first) + "," + std::to_string(second) +"})"; + if (type == "b"){ + bu[readNumber-1].push_back(nPat); + } else if (type == "u") { + bu[readNumber-1].push_back(nPat); + } // else if (type == "x") { } not needed to explicitly mention case 'x' + nPat++; } -} - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat, std::size_t len) + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t len) { - if (type == "b"){ - reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; - b[readNumber-1].push_back(nPat); - nPat++; - } else if (type == "u"){ - reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; - u[readNumber-1].push_back(nPat); - nPat++; - } + reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; + if (type == "b"){ + bu[readNumber-1].push_back(nPat); + } else if (type == "u"){ + bu[readNumber-1].push_back(nPat); + } + nPat++; } void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat) { - reg[readNumber -1] += "(" + seq + ")"; - nPat++; + reg[readNumber -1] += "(" + seq + ")"; + nPat++; } void modifyRegex(size_t readNumber, std::string* reg, unsigned int& nPat, unsigned int& bioPat) { - reg[readNumber - 1] += "([ATGC]{1,})"; - bioPat = nPat; - nPat++; - } - - void modifyRegex(size_t readNumber, std::string* reg, unsigned int& nPat, unsigned int& bioPat, std::size_t len) - { - reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; - bioPat = nPat; - nPat++; + reg[readNumber - 1] += "([ATGC]{1,})"; + bioPat = nPat; + nPat++; } void getTxpToGeneMap(spp::sparse_hash_map& txpToGeneMap, @@ -1328,27 +1320,31 @@ namespace alevin { struct apt::CustomGeo customGeo; customGeo.reg[0] = ""; customGeo.reg[1] = ""; - customGeo.nPatterns = 1; + unsigned int nPatterns = 1; + customGeo.minBcLen = 0, customGeo.maxBcLen = 0, customGeo.minUmiLen = 0, customGeo.minUmiLen = 0; customGeo.bioReadFound = false; struct ProtoInfo proto; peg::parser parser(R"( Specification <- ReadNumber1'{'Description{1,10}'}'ReadNumber2'{'Description{1,10}'}' ReadNumber1 <- [1] ReadNumber2 <- [2] - Description <- Type'['Position']' / Fixed'['Sequence']' / Type - Type <- 'b'i / 'u'i / 'r'i - Fixed <- 'f'i + Description <- Type'['Lengths']' / Fixed'['Sequence']' / Read + Type <- 'b' / 'u' / 'x' + Fixed <- 'f' + Read <- 'r' Sequence <- [ATGC]+ - Position <- (Number '-' Number) / (Number '-' End) / Length - Number <- [0-9]+ - Length <- Number - End <- 'end' + Lengths <- (Length '-' Length) / Length + Length <- [0-9]+ )"); - parser["Number"] = [](const peg::SemanticValues &sv) + parser["Length"] = [](const peg::SemanticValues &sv) { auto val = static_cast(std::stoull(sv.token())); + if(val<=0){ + std::cerr << "Lengths should be > 0. Exiting." << std::endl; + exit(1); + }; return val; }; @@ -1363,16 +1359,22 @@ namespace alevin { { auto val = std::stoi(sv.token()); proto.readNumber = val; - if(val == 2){ customGeo.nPatterns = 1;} + nPatterns = 1; }; parser["Description"] = [&](const peg::SemanticValues &sv) { auto val = std::string(sv.token()); - if (val == "b" || val == "u"){ - modifyRegex(proto.readNumber, val, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns); - } else if (val == "r") { - modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat); + if(val == "r") { + if (!customGeo.bioReadFound){ + customGeo.bioRead = proto.readNumber; + customGeo.bioReadFound = true; + } else { + // aopt.jointLog->error("Only contigous biological read expected.\nExiting now."); + std::cerr << "Only contigous biological read expected.\nExiting now." << std::endl; + exit(1); + } + modifyRegex(proto.readNumber, customGeo.reg, nPatterns, customGeo.bioPat); } }; @@ -1380,14 +1382,6 @@ namespace alevin { { auto val = std::string(sv.token()); proto.type = val; - if(val == "r"){ - if (!customGeo.bioReadFound){ - customGeo.bioRead = proto.readNumber; - customGeo.bioReadFound = true; - } else { - std::cout << "Error message: only contigous biological read expected" << std::endl; - } - } }; parser["Fixed"] = [&](const peg::SemanticValues &sv) @@ -1399,10 +1393,10 @@ namespace alevin { parser["Sequence"] = [&](const peg::SemanticValues &sv) { auto val = std::string(sv.token()); - modifyRegex(proto.readNumber, val, customGeo.reg, customGeo.nPatterns); + modifyRegex(proto.readNumber, val, customGeo.reg, nPatterns); }; - parser["Position"] = [&](const peg::SemanticValues &sv) + parser["Lengths"] = [&](const peg::SemanticValues &sv) { switch (sv.choice()) { @@ -1410,29 +1404,37 @@ namespace alevin { { auto val = std::make_pair( peg::any_cast(sv[0]), peg::any_cast(sv[1])); - if (proto.type == "b" || proto.type == "u") { - modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns, val.second - val.first + 1); + if(val.second <= val.first) { + // aopt.jointLog->error("In length range [X-Y], Y should be > X.\nExiting now"); + std::cerr << "In length range [X-Y], Y should be > X.\nExiting now" << std::endl; + exit(1); + } + if (proto.type == "b") { + customGeo.minBcLen += val.first; + customGeo.maxBcLen += val.second; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val.first, val.second); + } else if (proto.type == "u") { + customGeo.minUmiLen += val.first; + customGeo.maxUmiLen += val.second; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val.first, val.second); } else { - modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat, val.second - val.first + 1); + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val.first, val.second); // case 'x' } } break; case 1: - { - if (proto.type == "b" || proto.type == "u"){ - modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns); - } else { // when it's r - modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat); - } - } - break; - case 2: { auto val = peg::any_cast(sv[0]); - if (proto.type == "b" || proto.type == "u"){ - modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, customGeo.u, customGeo.nPatterns, val); + if (proto.type == "b") { + customGeo.minBcLen += val; // a fixed length bc increases the min length too, not updating can cause logical error when there are 2 pos of bcs + customGeo.maxBcLen += val; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val); + } else if (proto.type == "u"){ + customGeo.minUmiLen += val; + customGeo.maxUmiLen += val; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val); } else { - modifyRegex(proto.readNumber, customGeo.reg, customGeo.nPatterns, customGeo.bioPat, val); + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val); // case 'x' } } break; @@ -1446,10 +1448,14 @@ namespace alevin { parser.enable_packrat_parsing(); std::string geometry = vm["custom-geo"].as(); auto val = parser.parse(geometry.c_str()); - assert(val == true); + std::cout << "val: " << val << std::endl; + if(val == 0) { + std::cerr << "Incorrect custom geometry specification" << std::endl; // aopt.jointLog->error didn't work, need to check + exit(1); + } std::cout << "reg[0]: " << customGeo.reg[0] << std::endl; std::cout << "reg[1]: " << customGeo.reg[1] << std::endl; - + alevin::types::AlevinUMIKmer::k( static_cast(customGeo.maxUmiLen) ); } //validate specified iupac From 50decde327c064f1d0de8cce2e44e37ae88dafbc Mon Sep 17 00:00:00 2001 From: Gaurav Date: Wed, 12 Jan 2022 23:13:18 -0500 Subject: [PATCH 05/17] add more checks to custom geo --- include/AlevinUtils.hpp | 5 ++--- src/AlevinUtils.cpp | 28 ++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp index c450dd4c9..fb27b371c 100755 --- a/include/AlevinUtils.hpp +++ b/include/AlevinUtils.hpp @@ -131,9 +131,8 @@ namespace alevin{ void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat); void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat, unsigned int& bioPat); - void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat, unsigned int& bioPat, std::size_t len); - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat, std::size_t len); - void modifyRegex(size_t readNumber, std::string desc, std::string* reg, std::vector *b, std::vector *u, unsigned int& nPat); + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t len); + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t first, std::size_t second); } } #endif // __ALEVIN_UTILS_HPP__ diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 98e009946..dbdbd6cb8 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -9,8 +9,8 @@ std::vector alevin::protocols::CustomGeo::b[2], alevin::protocols::CustomGeo::u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex unsigned int alevin::protocols::CustomGeo::bioRead, alevin::protocols::CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read - unsigned int alevin::protocols::CustomGeo::minBcLen, alevin::protocols::CustomGeo::maxBcLen; - unsigned int alevin::protocols::CustomGeo::minUmiLen, alevin::protocols::CustomGeo::maxUmiLen; + uint32_t alevin::protocols::CustomGeo::minBcLen, alevin::protocols::CustomGeo::maxBcLen; + uint32_t alevin::protocols::CustomGeo::minUmiLen, alevin::protocols::CustomGeo::maxUmiLen; bool alevin::protocols::CustomGeo::bioReadFound; namespace alevin { @@ -1450,11 +1450,31 @@ namespace alevin { auto val = parser.parse(geometry.c_str()); std::cout << "val: " << val << std::endl; if(val == 0) { - std::cerr << "Incorrect custom geometry specification" << std::endl; // aopt.jointLog->error didn't work, need to check - exit(1); + aopt.jointLog->error("Incorrect geometry spec: {} \n" + "Exiting now", geometry); + return false; } std::cout << "reg[0]: " << customGeo.reg[0] << std::endl; std::cout << "reg[1]: " << customGeo.reg[1] << std::endl; + + // validate that BC and UMI lengths are OK + uint32_t maxBC{31}; + uint32_t maxUMI{31}; + // the barcode length must be in [1,31] + if ((customGeo.minBcLen < 1) or (customGeo.maxBcLen > maxBC)) { + aopt.jointLog->error("Barcode length ({}) was not in the required length range [1, {}].\n" + "Exiting now.", customGeo.maxBcLen, maxBC); + return false; + } + // if it's OK, set the barcode kmer length + alevin::types::AlevinCellBarcodeKmer::k( static_cast(customGeo.maxBcLen) ); + // the UMI length must be in [1,31] + if ((customGeo.minUmiLen < 1) or (customGeo.maxUmiLen > maxUMI)) { + aopt.jointLog->error("UMI length ({}) was not in the required length range [1, {}].\n" + "Exiting now.", customGeo.maxUmiLen, maxUMI); + return false; + } + // if it's OK, set the umi kmer length alevin::types::AlevinUMIKmer::k( static_cast(customGeo.maxUmiLen) ); } From 52e76b012c4246e9e4703999a580251d3f329f12 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 13 Jan 2022 00:31:12 -0500 Subject: [PATCH 06/17] add padding for bc and umi seqs --- include/AlevinUtils.hpp | 2 ++ include/SingleCellProtocols.hpp | 9 +++++++-- src/AlevinUtils.cpp | 36 +++++++++++++++++++++++++++++---- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp index fb27b371c..dd49b2f6e 100755 --- a/include/AlevinUtils.hpp +++ b/include/AlevinUtils.hpp @@ -74,6 +74,8 @@ namespace alevin{ TrueBcsT& trueBarcodes); unsigned int hammingDistance(const std::string s1, const std::string s2); + + void addPadding(std::string& seq, uint32_t max, std::string padBases, unsigned int padLen); template bool processAlevinOpts(AlevinOpts& aopt, diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 270f6bfbf..039989fff 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -228,12 +228,15 @@ namespace alevin{ // store positions of matches for bc and umi static std::vector b[2], u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex - static unsigned int nPatterns, bioRead, bioPat; // biological read would be contigous and on only 1 of the read + static unsigned int bioRead, bioPat; // biological read would be contigous and on only 1 of the read + static uint32_t minBcLen, maxBcLen, minUmiLen, maxUmiLen; static bool bioReadFound; + std::string paddingBases = "ACGT"; + unsigned int padLen = paddingBases.length(); // store the matches for both reads boost::smatch match[2]; // required - uint32_t barcodeLength, umiLength, maxValue; + static uint32_t barcodeLength, umiLength; BarcodeEnd end; TagGeometry umi_geo; @@ -258,3 +261,5 @@ namespace alevin{ } #endif + + diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index dbdbd6cb8..fdc3c2452 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -11,6 +11,7 @@ unsigned int alevin::protocols::CustomGeo::bioRead, alevin::protocols::CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read uint32_t alevin::protocols::CustomGeo::minBcLen, alevin::protocols::CustomGeo::maxBcLen; uint32_t alevin::protocols::CustomGeo::minUmiLen, alevin::protocols::CustomGeo::maxUmiLen; + uint32_t alevin::protocols::CustomGeo::barcodeLength, alevin::protocols::CustomGeo::umiLength; bool alevin::protocols::CustomGeo::bioReadFound; namespace alevin { @@ -278,7 +279,10 @@ namespace alevin { for(int i : pt.u[r]) { um += pt.match[r][i]; } - } + } + if(pt.minUmiLen < pt.maxUmiLen) { + addPadding(um, pt.maxUmiLen, pt.paddingBases, pt.padLen); + } umi = um; std::cout << "UMI: "<< umi <())); } + void addPadding(std::string& seq, uint32_t max, std::string padBases, unsigned int padLen) { + int diff = max - seq.length() + 1; // add one base if the length is same to avoid erroneous collisions + int rep = diff / padLen; + int extra = diff % padLen; + for(int i = 0; i < rep; i++){ + seq += padBases; + } + seq += padBases.substr(0,extra); + } + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t first, std::size_t second) { reg[readNumber-1] += "([ATGC]{" + std::to_string(first) + "," + std::to_string(second) +"})"; @@ -1466,16 +1483,27 @@ namespace alevin { "Exiting now.", customGeo.maxBcLen, maxBC); return false; } + if (customGeo.maxBcLen == customGeo.minBcLen) { + customGeo.barcodeLength = customGeo.maxBcLen; + } else { + customGeo.barcodeLength = customGeo.maxBcLen + 1; + } // if it's OK, set the barcode kmer length - alevin::types::AlevinCellBarcodeKmer::k( static_cast(customGeo.maxBcLen) ); + alevin::types::AlevinCellBarcodeKmer::k( static_cast(customGeo.barcodeLength) ); // the UMI length must be in [1,31] if ((customGeo.minUmiLen < 1) or (customGeo.maxUmiLen > maxUMI)) { aopt.jointLog->error("UMI length ({}) was not in the required length range [1, {}].\n" "Exiting now.", customGeo.maxUmiLen, maxUMI); return false; } + if (customGeo.maxUmiLen == customGeo.minUmiLen) { + customGeo.umiLength = customGeo.maxUmiLen; + } else { + customGeo.umiLength = customGeo.maxUmiLen + 1; + } // if it's OK, set the umi kmer length - alevin::types::AlevinUMIKmer::k( static_cast(customGeo.maxUmiLen) ); + alevin::types::AlevinUMIKmer::k( static_cast(customGeo.umiLength) ); + } //validate specified iupac From 6abd28105103663e9eae5bdba855384db6be30e2 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 13 Jan 2022 00:52:45 -0500 Subject: [PATCH 07/17] get read sequence using regex --- src/AlevinUtils.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index fdc3c2452..ca3ea0c71 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -155,15 +155,20 @@ namespace alevin { //subseq = seq2; } template <> - std::string* getReadSequence(apt::CustomGeo& protocol, + std::string* getReadSequence(apt::CustomGeo& pt, std::string& seq, std::string& seq2, std::string& subseq){ - if(protocol.bioRead == 1) { - return &seq; + int r = pt.bioRead - 1; + boost::regex rgx(pt.reg[r]); + subseq.clear(); + std::string read = r == 0 ? seq : seq2; + if(boost::regex_search(read,pt.match[r],rgx)){ + subseq.append(pt.match[r][pt.bioPat]); } else { - return &seq2; + return &subseq; // return empty if regex fails } + return &subseq; // return extracted if it passes } template <> std::string* getReadSequence(apt::Gemcode& protocol, From 9eb148af605c953010a77a1678cfa6fb18b760c1 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 13 Jan 2022 01:26:34 -0500 Subject: [PATCH 08/17] make customGeo class members extern instead of global variables --- src/AlevinUtils.cpp | 12 ------------ src/SingleCellProtocols.cpp | 13 ++++++++++++- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index ca3ea0c71..5192d3af6 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -1,18 +1,6 @@ #include "AlevinUtils.hpp" #include "peglib.h" #include -// #include - -// store regex for reads 1 and 2 - std::string alevin::protocols::CustomGeo::reg[2]; - // store positions of matches for bc and umi - std::vector alevin::protocols::CustomGeo::b[2], alevin::protocols::CustomGeo::u[2]; - // bioRead stores the read number for biological read and bioPat stores match pattern number on regex - unsigned int alevin::protocols::CustomGeo::bioRead, alevin::protocols::CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read - uint32_t alevin::protocols::CustomGeo::minBcLen, alevin::protocols::CustomGeo::maxBcLen; - uint32_t alevin::protocols::CustomGeo::minUmiLen, alevin::protocols::CustomGeo::maxUmiLen; - uint32_t alevin::protocols::CustomGeo::barcodeLength, alevin::protocols::CustomGeo::umiLength; - bool alevin::protocols::CustomGeo::bioReadFound; namespace alevin { namespace utils { diff --git a/src/SingleCellProtocols.cpp b/src/SingleCellProtocols.cpp index ab15695db..c7b65f5a9 100644 --- a/src/SingleCellProtocols.cpp +++ b/src/SingleCellProtocols.cpp @@ -1,6 +1,5 @@ #include "SingleCellProtocols.hpp" - namespace alevin{ namespace protocols { @@ -28,5 +27,17 @@ namespace alevin{ return os; } + // store regex for reads 1 and 2 + extern std::string CustomGeo::reg[2]; + // store positions of matches for bc and umi + extern std::vector CustomGeo::b[2], CustomGeo::u[2]; + // bioRead stores the read number for biological read and bioPat stores match pattern number on regex + extern unsigned int CustomGeo::bioRead, CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read + extern uint32_t CustomGeo::minBcLen, CustomGeo::maxBcLen; + extern uint32_t CustomGeo::minUmiLen, CustomGeo::maxUmiLen; + extern uint32_t CustomGeo::barcodeLength, CustomGeo::umiLength; + // bool alevin::protocols::CustomGeo::bioReadFound; + extern bool CustomGeo::bioReadFound; + }// protocols }//alevin \ No newline at end of file From d0d56ff4e657d9bc47769934b02c7f9882afe7ab Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 13 Jan 2022 21:53:31 -0500 Subject: [PATCH 09/17] introduce regex checks and avoid logical errors: - read returned only when read regex search is a success - only check reads that have barcode and umi for bc and umi - if regex search fails in bc, return false --- include/SingleCellProtocols.hpp | 2 ++ src/AlevinUtils.cpp | 35 ++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 039989fff..907a4bddc 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -235,6 +235,8 @@ namespace alevin{ unsigned int padLen = paddingBases.length(); // store the matches for both reads boost::smatch match[2]; + // store the success of regex search + bool rgx_search[2]; // required static uint32_t barcodeLength, umiLength; BarcodeEnd end; diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 5192d3af6..857c35291 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -148,15 +148,11 @@ namespace alevin { std::string& seq2, std::string& subseq){ int r = pt.bioRead - 1; - boost::regex rgx(pt.reg[r]); subseq.clear(); - std::string read = r == 0 ? seq : seq2; - if(boost::regex_search(read,pt.match[r],rgx)){ + if(pt.rgx_search[r]) { subseq.append(pt.match[r][pt.bioPat]); - } else { - return &subseq; // return empty if regex fails } - return &subseq; // return extracted if it passes + return &subseq; // return extracted if the rgx_search was success, empty otherwise } template <> std::string* getReadSequence(apt::Gemcode& protocol, @@ -269,15 +265,16 @@ namespace alevin { std::string& umi){ std::string um = ""; for(int r=0; r < 2; r++) { - for(int i : pt.u[r]) { - um += pt.match[r][i]; + if(!pt.u[r].empty()) { + for(int i : pt.u[r]) { + um += pt.match[r][i]; + } } } if(pt.minUmiLen < pt.maxUmiLen) { addPadding(um, pt.maxUmiLen, pt.paddingBases, pt.padLen); } umi = um; - std::cout << "UMI: "<< umi < @@ -449,19 +446,22 @@ namespace alevin { std::string barcode=""; for(int r=0; r < 2; r++) { boost::regex rgx(pt.reg[r]); - if(boost::regex_search(read1,pt.match[r],rgx)){ - for(int i : pt.b[r]) { - barcode += pt.match[r][i]; + pt.rgx_search[r] = (r == 0) ? boost::regex_search(read1,pt.match[r],rgx) : + boost::regex_search(read2,pt.match[r],rgx); // using std::string instead of read1/2 results in blank umi. strange! + if(!pt.b[r].empty()) { + if(pt.rgx_search[r]){ + for(int i : pt.b[r]) { + barcode += pt.match[r][i]; + } + } else { + return false; } - } else { - return false; } } if(pt.minBcLen < pt.maxBcLen) { addPadding(barcode, pt.maxBcLen, pt.paddingBases, pt.padLen); } bc = barcode; - std::cout << "BC: "<< barcode < @@ -1458,14 +1458,13 @@ namespace alevin { parser.enable_packrat_parsing(); std::string geometry = vm["custom-geo"].as(); auto val = parser.parse(geometry.c_str()); - std::cout << "val: " << val << std::endl; if(val == 0) { aopt.jointLog->error("Incorrect geometry spec: {} \n" "Exiting now", geometry); return false; } - std::cout << "reg[0]: " << customGeo.reg[0] << std::endl; - std::cout << "reg[1]: " << customGeo.reg[1] << std::endl; + // std::cout << "reg[0]: " << customGeo.reg[0] << std::endl; + // std::cout << "reg[1]: " << customGeo.reg[1] << std::endl; // validate that BC and UMI lengths are OK uint32_t maxBC{31}; From 5a1216797510ab5340a4f2900f1981ccdac3bfb1 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Fri, 14 Jan 2022 08:54:21 -0500 Subject: [PATCH 10/17] add code documentation --- include/SingleCellProtocols.hpp | 2 +- src/AlevinUtils.cpp | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 907a4bddc..c616b9402 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -221,7 +221,7 @@ namespace alevin{ BarcodeEnd end; }; - // Custome geometry specification using regex for extraction + // Custom geometry specification using regex for extraction struct CustomGeo { // store regex for reads 1 and 2 static std::string reg[2]; diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 857c35291..6ce402727 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -265,9 +265,9 @@ namespace alevin { std::string& umi){ std::string um = ""; for(int r=0; r < 2; r++) { - if(!pt.u[r].empty()) { + if(!pt.u[r].empty()) { // if umi is present on read r for(int i : pt.u[r]) { - um += pt.match[r][i]; + um += pt.match[r][i]; // concat all umi sequences } } } @@ -448,10 +448,10 @@ namespace alevin { boost::regex rgx(pt.reg[r]); pt.rgx_search[r] = (r == 0) ? boost::regex_search(read1,pt.match[r],rgx) : boost::regex_search(read2,pt.match[r],rgx); // using std::string instead of read1/2 results in blank umi. strange! - if(!pt.b[r].empty()) { - if(pt.rgx_search[r]){ + if(!pt.b[r].empty()) { // if read r has barcode + if(pt.rgx_search[r]){ // if rgx search was successful for(int i : pt.b[r]) { - barcode += pt.match[r][i]; + barcode += pt.match[r][i]; // concat all barcode sequences } } else { return false; @@ -1326,6 +1326,14 @@ namespace alevin { alevin::types::AlevinUMIKmer::k( static_cast(umi_geo.length()) ); } else if (have_custom_geo) { // regex based unified barcode geometry parsing + /* Custom Geometry (--custom-geo) should be used when: + * 1. Barcode or umi have variable lengths + * 2. There is known fixed sequence in the reads + * 3. There is some sequence to be excluded + * + * From the input peglib spec it creates a regex. Boost regex lib is used to parse + * the reads. + */ struct apt::CustomGeo customGeo; customGeo.reg[0] = ""; From 52ce33cc252ca2148cfc4ee1bf6fd24f29eff354 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Fri, 14 Jan 2022 12:34:19 -0500 Subject: [PATCH 11/17] make regex static and initliaze once --- include/SingleCellProtocols.hpp | 4 +++- src/AlevinUtils.cpp | 8 ++++---- src/SingleCellProtocols.cpp | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index c616b9402..6079b1167 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -223,7 +223,7 @@ namespace alevin{ // Custom geometry specification using regex for extraction struct CustomGeo { - // store regex for reads 1 and 2 + // store regex string for reads 1 and 2 static std::string reg[2]; // store positions of matches for bc and umi static std::vector b[2], u[2]; @@ -237,6 +237,8 @@ namespace alevin{ boost::smatch match[2]; // store the success of regex search bool rgx_search[2]; + // store the regex + static boost::regex rgx[2]; // required static uint32_t barcodeLength, umiLength; BarcodeEnd end; diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 6ce402727..22bcbf4b8 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -445,9 +445,8 @@ namespace alevin { std::string& bc){ std::string barcode=""; for(int r=0; r < 2; r++) { - boost::regex rgx(pt.reg[r]); - pt.rgx_search[r] = (r == 0) ? boost::regex_search(read1,pt.match[r],rgx) : - boost::regex_search(read2,pt.match[r],rgx); // using std::string instead of read1/2 results in blank umi. strange! + pt.rgx_search[r] = (r == 0) ? boost::regex_search(read1,pt.match[r],pt.rgx[r]) : + boost::regex_search(read2,pt.match[r],pt.rgx[r]); // using std::string instead of read1/2 results in blank umi. strange! if(!pt.b[r].empty()) { // if read r has barcode if(pt.rgx_search[r]){ // if rgx search was successful for(int i : pt.b[r]) { @@ -1473,7 +1472,8 @@ namespace alevin { } // std::cout << "reg[0]: " << customGeo.reg[0] << std::endl; // std::cout << "reg[1]: " << customGeo.reg[1] << std::endl; - + customGeo.rgx[0] = customGeo.reg[0]; + customGeo.rgx[1] = customGeo.reg[1]; // validate that BC and UMI lengths are OK uint32_t maxBC{31}; uint32_t maxUMI{31}; diff --git a/src/SingleCellProtocols.cpp b/src/SingleCellProtocols.cpp index c7b65f5a9..fbf10e343 100644 --- a/src/SingleCellProtocols.cpp +++ b/src/SingleCellProtocols.cpp @@ -38,6 +38,6 @@ namespace alevin{ extern uint32_t CustomGeo::barcodeLength, CustomGeo::umiLength; // bool alevin::protocols::CustomGeo::bioReadFound; extern bool CustomGeo::bioReadFound; - + extern boost::regex CustomGeo::rgx[2]; }// protocols }//alevin \ No newline at end of file From 0019606cfcadea31af556f54be1174d990ee0caf Mon Sep 17 00:00:00 2001 From: Gaurav Date: Mon, 24 Jan 2022 12:51:39 -0500 Subject: [PATCH 12/17] modify data types --- include/SingleCellProtocols.hpp | 8 ++++---- src/AlevinUtils.cpp | 18 ++++++++---------- src/SingleCellProtocols.cpp | 4 ++-- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 6079b1167..414b615d8 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -226,13 +226,13 @@ namespace alevin{ // store regex string for reads 1 and 2 static std::string reg[2]; // store positions of matches for bc and umi - static std::vector b[2], u[2]; + static itlib::small_vector b[2], u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex - static unsigned int bioRead, bioPat; // biological read would be contigous and on only 1 of the read + static uint32_t bioRead, bioPat; // biological read would be contigous and on only 1 of the read static uint32_t minBcLen, maxBcLen, minUmiLen, maxUmiLen; static bool bioReadFound; - std::string paddingBases = "ACGT"; - unsigned int padLen = paddingBases.length(); + constexpr static std::string paddingBases = "ACGT"; + constexpr static uint32_t padLen = paddingBases.length(); // store the matches for both reads boost::smatch match[2]; // store the success of regex search diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 22bcbf4b8..bff518ff4 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -617,17 +617,17 @@ namespace alevin { neighbors); } - unsigned int hammingDistance(const std::string s1, const std::string s2) { + uint32_t hammingDistance(const std::string s1, const std::string s2) { if(s1.size() != s2.size()){ throw std::invalid_argument("Strings have different lengths, can't compute hamming distance"); } // compute dot product for all postisions, start with 0 and add if the values are not equal - return std::inner_product(s1.begin(),s1.end(),s2.begin(), 0, std::plus(), + return std::inner_product(s1.begin(),s1.end(),s2.begin(), 0, std::plus(), std::not2(std::equal_to())); } - void addPadding(std::string& seq, uint32_t max, std::string padBases, unsigned int padLen) { + void addPadding(std::string& seq, uint32_t max, std::string padBases, uint32_t padLen) { int diff = max - seq.length() + 1; // add one base if the length is same to avoid erroneous collisions int rep = diff / padLen; int extra = diff % padLen; @@ -637,7 +637,7 @@ namespace alevin { seq += padBases.substr(0,extra); } - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t first, std::size_t second) + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, uint32_t& nPat, std::size_t first, std::size_t second) { reg[readNumber-1] += "([ATGC]{" + std::to_string(first) + "," + std::to_string(second) +"})"; if (type == "b"){ @@ -648,7 +648,7 @@ namespace alevin { nPat++; } - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t len) + void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, uint32_t& nPat, std::size_t len) { reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; if (type == "b"){ @@ -659,13 +659,13 @@ namespace alevin { nPat++; } - void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat) + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat) { reg[readNumber -1] += "(" + seq + ")"; nPat++; } - void modifyRegex(size_t readNumber, std::string* reg, unsigned int& nPat, unsigned int& bioPat) + void modifyRegex(size_t readNumber, std::string* reg, uint32_t& nPat, uint32_t& bioPat) { reg[readNumber - 1] += "([ATGC]{1,})"; bioPat = nPat; @@ -1335,9 +1335,7 @@ namespace alevin { */ struct apt::CustomGeo customGeo; - customGeo.reg[0] = ""; - customGeo.reg[1] = ""; - unsigned int nPatterns = 1; + uint32_t nPatterns = 1; customGeo.minBcLen = 0, customGeo.maxBcLen = 0, customGeo.minUmiLen = 0, customGeo.minUmiLen = 0; customGeo.bioReadFound = false; struct ProtoInfo proto; diff --git a/src/SingleCellProtocols.cpp b/src/SingleCellProtocols.cpp index fbf10e343..18103611c 100644 --- a/src/SingleCellProtocols.cpp +++ b/src/SingleCellProtocols.cpp @@ -30,9 +30,9 @@ namespace alevin{ // store regex for reads 1 and 2 extern std::string CustomGeo::reg[2]; // store positions of matches for bc and umi - extern std::vector CustomGeo::b[2], CustomGeo::u[2]; + extern itlib::small_vector CustomGeo::b[2], CustomGeo::u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex - extern unsigned int CustomGeo::bioRead, CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read + extern uint32_t CustomGeo::bioRead, CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read extern uint32_t CustomGeo::minBcLen, CustomGeo::maxBcLen; extern uint32_t CustomGeo::minUmiLen, CustomGeo::maxUmiLen; extern uint32_t CustomGeo::barcodeLength, CustomGeo::umiLength; From f46450d49e1fb104eb21257ac6b3a2884758b6f3 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Wed, 26 Jan 2022 12:26:51 -0500 Subject: [PATCH 13/17] update data types and definitions --- include/AlevinUtils.hpp | 10 +++++----- include/SingleCellProtocols.hpp | 12 +++++++----- src/AlevinUtils.cpp | 32 +++++++++++++++++--------------- src/SingleCellProtocols.cpp | 3 ++- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp index dd49b2f6e..a3748d34b 100755 --- a/include/AlevinUtils.hpp +++ b/include/AlevinUtils.hpp @@ -75,7 +75,7 @@ namespace alevin{ unsigned int hammingDistance(const std::string s1, const std::string s2); - void addPadding(std::string& seq, uint32_t max, std::string padBases, unsigned int padLen); + void addPadding(std::string& seq, uint32_t max, const char padBases[], uint32_t padLen); template bool processAlevinOpts(AlevinOpts& aopt, @@ -131,10 +131,10 @@ namespace alevin{ spp::sparse_hash_map& txpToGeneMap, const size_t numGenes); - void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat); - void modifyRegex(size_t readNumber, std::string seq, std::string* reg, unsigned int& nPat, unsigned int& bioPat); - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t len); - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, unsigned int& nPat, std::size_t first, std::size_t second); + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat); + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat, uint32_t& bioPat); + void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len); + void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second); } } #endif // __ALEVIN_UTILS_HPP__ diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 414b615d8..c8828dd75 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -8,6 +8,8 @@ #include "AlevinOpts.hpp" #include "AlevinTypes.hpp" #include "pufferfish/itlib/static_vector.hpp" +#include "pufferfish/itlib/small_vector.hpp" + namespace alevin{ namespace protocols { @@ -226,13 +228,14 @@ namespace alevin{ // store regex string for reads 1 and 2 static std::string reg[2]; // store positions of matches for bc and umi - static itlib::small_vector b[2], u[2]; + static itlib::small_vector b[2], u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex static uint32_t bioRead, bioPat; // biological read would be contigous and on only 1 of the read static uint32_t minBcLen, maxBcLen, minUmiLen, maxUmiLen; static bool bioReadFound; - constexpr static std::string paddingBases = "ACGT"; - constexpr static uint32_t padLen = paddingBases.length(); + static constexpr const char paddingBases[4] = {'A', 'C', 'G', 'T'}; + // static std::string paddingBases = "ACGT"; + static constexpr uint32_t padLen = 4; // store the matches for both reads boost::smatch match[2]; // store the success of regex search @@ -258,8 +261,7 @@ namespace alevin{ struct ProtoInfo { size_t readNumber; - std::string type; - std::string desc; + char type; }; } diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index bff518ff4..af650a88a 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -627,33 +627,35 @@ namespace alevin { std::not2(std::equal_to())); } - void addPadding(std::string& seq, uint32_t max, std::string padBases, uint32_t padLen) { + void addPadding(std::string& seq, uint32_t max, const char padBases[], uint32_t padLen) { int diff = max - seq.length() + 1; // add one base if the length is same to avoid erroneous collisions int rep = diff / padLen; int extra = diff % padLen; for(int i = 0; i < rep; i++){ seq += padBases; } - seq += padBases.substr(0,extra); + for(int i = 0; i < extra; i++){ + seq += padBases[i]; + } } - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, uint32_t& nPat, std::size_t first, std::size_t second) + void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second) { reg[readNumber-1] += "([ATGC]{" + std::to_string(first) + "," + std::to_string(second) +"})"; - if (type == "b"){ + if (type == 'b'){ bu[readNumber-1].push_back(nPat); - } else if (type == "u") { + } else if (type == 'u') { bu[readNumber-1].push_back(nPat); - } // else if (type == "x") { } not needed to explicitly mention case 'x' + } // else if (type == 'x') { } not needed to explicitly mention case 'x' nPat++; } - void modifyRegex(size_t readNumber, std::string type, std::string* reg, std::vector *bu, uint32_t& nPat, std::size_t len) + void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len) { reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; - if (type == "b"){ + if (type == 'b'){ bu[readNumber-1].push_back(nPat); - } else if (type == "u"){ + } else if (type == 'u'){ bu[readNumber-1].push_back(nPat); } nPat++; @@ -1395,13 +1397,13 @@ namespace alevin { parser["Type"] = [&](const peg::SemanticValues &sv) { - auto val = std::string(sv.token()); + auto val = std::string(sv.token())[0]; proto.type = val; }; parser["Fixed"] = [&](const peg::SemanticValues &sv) { - auto val = std::string(sv.token()); + auto val = std::string(sv.token())[0]; proto.type = val; }; @@ -1424,11 +1426,11 @@ namespace alevin { std::cerr << "In length range [X-Y], Y should be > X.\nExiting now" << std::endl; exit(1); } - if (proto.type == "b") { + if (proto.type == 'b') { customGeo.minBcLen += val.first; customGeo.maxBcLen += val.second; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val.first, val.second); - } else if (proto.type == "u") { + } else if (proto.type == 'u') { customGeo.minUmiLen += val.first; customGeo.maxUmiLen += val.second; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val.first, val.second); @@ -1440,11 +1442,11 @@ namespace alevin { case 1: { auto val = peg::any_cast(sv[0]); - if (proto.type == "b") { + if (proto.type == 'b') { customGeo.minBcLen += val; // a fixed length bc increases the min length too, not updating can cause logical error when there are 2 pos of bcs customGeo.maxBcLen += val; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val); - } else if (proto.type == "u"){ + } else if (proto.type == 'u'){ customGeo.minUmiLen += val; customGeo.maxUmiLen += val; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val); diff --git a/src/SingleCellProtocols.cpp b/src/SingleCellProtocols.cpp index 18103611c..401da176f 100644 --- a/src/SingleCellProtocols.cpp +++ b/src/SingleCellProtocols.cpp @@ -30,12 +30,13 @@ namespace alevin{ // store regex for reads 1 and 2 extern std::string CustomGeo::reg[2]; // store positions of matches for bc and umi - extern itlib::small_vector CustomGeo::b[2], CustomGeo::u[2]; + extern itlib::small_vector CustomGeo::b[2], CustomGeo::u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex extern uint32_t CustomGeo::bioRead, CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read extern uint32_t CustomGeo::minBcLen, CustomGeo::maxBcLen; extern uint32_t CustomGeo::minUmiLen, CustomGeo::maxUmiLen; extern uint32_t CustomGeo::barcodeLength, CustomGeo::umiLength; + extern constexpr const char CustomGeo::paddingBases[4]; // bool alevin::protocols::CustomGeo::bioReadFound; extern bool CustomGeo::bioReadFound; extern boost::regex CustomGeo::rgx[2]; From 1323debfc293beadb20a0119eb1a639d787266e6 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Wed, 26 Jan 2022 13:37:33 -0500 Subject: [PATCH 14/17] updates: - avoid bc and umi string allocation for each read, add them to CustomGeo - pass hamming distance arguments by ref --- include/AlevinUtils.hpp | 2 +- include/SingleCellProtocols.hpp | 3 ++- src/AlevinUtils.cpp | 18 +++++++++--------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp index a3748d34b..91461f4da 100755 --- a/include/AlevinUtils.hpp +++ b/include/AlevinUtils.hpp @@ -73,7 +73,7 @@ namespace alevin{ void readWhitelist(bfs::path& filePath, TrueBcsT& trueBarcodes); - unsigned int hammingDistance(const std::string s1, const std::string s2); + uint32_t hammingDistance(const std::string& s1, const std::string& s2); void addPadding(std::string& seq, uint32_t max, const char padBases[], uint32_t padLen); diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index c8828dd75..0ac4e449b 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -234,7 +234,6 @@ namespace alevin{ static uint32_t minBcLen, maxBcLen, minUmiLen, maxUmiLen; static bool bioReadFound; static constexpr const char paddingBases[4] = {'A', 'C', 'G', 'T'}; - // static std::string paddingBases = "ACGT"; static constexpr uint32_t padLen = 4; // store the matches for both reads boost::smatch match[2]; @@ -242,6 +241,8 @@ namespace alevin{ bool rgx_search[2]; // store the regex static boost::regex rgx[2]; + // string for extract barcode and umi + std::string barcode, um; // required static uint32_t barcodeLength, umiLength; BarcodeEnd end; diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index af650a88a..c4fbc22f3 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -263,18 +263,18 @@ namespace alevin { std::string& read2, apt::CustomGeo& pt, std::string& umi){ - std::string um = ""; + pt.um = ""; for(int r=0; r < 2; r++) { if(!pt.u[r].empty()) { // if umi is present on read r for(int i : pt.u[r]) { - um += pt.match[r][i]; // concat all umi sequences + pt.um += pt.match[r][i]; // concat all umi sequences } } } if(pt.minUmiLen < pt.maxUmiLen) { - addPadding(um, pt.maxUmiLen, pt.paddingBases, pt.padLen); + addPadding(pt.um, pt.maxUmiLen, pt.paddingBases, pt.padLen); } - umi = um; + umi = pt.um; return true; } template <> @@ -443,14 +443,14 @@ namespace alevin { std::string& read2, apt::CustomGeo& pt, std::string& bc){ - std::string barcode=""; + pt.barcode=""; for(int r=0; r < 2; r++) { pt.rgx_search[r] = (r == 0) ? boost::regex_search(read1,pt.match[r],pt.rgx[r]) : boost::regex_search(read2,pt.match[r],pt.rgx[r]); // using std::string instead of read1/2 results in blank umi. strange! if(!pt.b[r].empty()) { // if read r has barcode if(pt.rgx_search[r]){ // if rgx search was successful for(int i : pt.b[r]) { - barcode += pt.match[r][i]; // concat all barcode sequences + pt.barcode += pt.match[r][i]; // concat all barcode sequences } } else { return false; @@ -458,9 +458,9 @@ namespace alevin { } } if(pt.minBcLen < pt.maxBcLen) { - addPadding(barcode, pt.maxBcLen, pt.paddingBases, pt.padLen); + addPadding(pt.barcode, pt.maxBcLen, pt.paddingBases, pt.padLen); } - bc = barcode; + bc = pt.barcode; return true; } template <> @@ -617,7 +617,7 @@ namespace alevin { neighbors); } - uint32_t hammingDistance(const std::string s1, const std::string s2) { + uint32_t hammingDistance(const std::string& s1, const std::string& s2) { if(s1.size() != s2.size()){ throw std::invalid_argument("Strings have different lengths, can't compute hamming distance"); } From 7b14dde77228e5a13b6c996a6b3990a4921de6ef Mon Sep 17 00:00:00 2001 From: Gaurav Date: Wed, 26 Jan 2022 14:59:13 -0500 Subject: [PATCH 15/17] make enum class for customGeo readpart type --- include/AlevinUtils.hpp | 4 ++-- include/SingleCellProtocols.hpp | 12 +++++++++++- src/AlevinUtils.cpp | 24 ++++++++++++------------ 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp index 91461f4da..43b3a9b2b 100755 --- a/include/AlevinUtils.hpp +++ b/include/AlevinUtils.hpp @@ -133,8 +133,8 @@ namespace alevin{ void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat); void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat, uint32_t& bioPat); - void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len); - void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second); + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len); + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second); } } #endif // __ALEVIN_UTILS_HPP__ diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 0ac4e449b..e6a2d068a 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -259,10 +259,20 @@ namespace alevin{ }; } + + // CustomGeo type of read part + enum class customReadpartType : char { + bc = 'b', + umi = 'u', + fixed = 'f', + exclude = 'x' + }; + + // Store the readNumber and type info for CustomGeo struct ProtoInfo { size_t readNumber; - char type; + customReadpartType type; }; } diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index c4fbc22f3..1954609c2 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -639,23 +639,23 @@ namespace alevin { } } - void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second) + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second) { reg[readNumber-1] += "([ATGC]{" + std::to_string(first) + "," + std::to_string(second) +"})"; - if (type == 'b'){ + if (type == customReadpartType::bc){ bu[readNumber-1].push_back(nPat); - } else if (type == 'u') { + } else if (type == customReadpartType::umi) { bu[readNumber-1].push_back(nPat); } // else if (type == 'x') { } not needed to explicitly mention case 'x' nPat++; } - void modifyRegex(size_t readNumber, char type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len) + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len) { reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; - if (type == 'b'){ + if (type == customReadpartType::bc){ bu[readNumber-1].push_back(nPat); - } else if (type == 'u'){ + } else if (type == customReadpartType::umi){ bu[readNumber-1].push_back(nPat); } nPat++; @@ -1398,13 +1398,13 @@ namespace alevin { parser["Type"] = [&](const peg::SemanticValues &sv) { auto val = std::string(sv.token())[0]; - proto.type = val; + proto.type = static_cast(val); }; parser["Fixed"] = [&](const peg::SemanticValues &sv) { auto val = std::string(sv.token())[0]; - proto.type = val; + proto.type = static_cast(val); }; parser["Sequence"] = [&](const peg::SemanticValues &sv) @@ -1426,11 +1426,11 @@ namespace alevin { std::cerr << "In length range [X-Y], Y should be > X.\nExiting now" << std::endl; exit(1); } - if (proto.type == 'b') { + if (proto.type == customReadpartType::bc) { customGeo.minBcLen += val.first; customGeo.maxBcLen += val.second; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val.first, val.second); - } else if (proto.type == 'u') { + } else if (proto.type == customReadpartType::umi) { customGeo.minUmiLen += val.first; customGeo.maxUmiLen += val.second; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val.first, val.second); @@ -1442,11 +1442,11 @@ namespace alevin { case 1: { auto val = peg::any_cast(sv[0]); - if (proto.type == 'b') { + if (proto.type == customReadpartType::bc) { customGeo.minBcLen += val; // a fixed length bc increases the min length too, not updating can cause logical error when there are 2 pos of bcs customGeo.maxBcLen += val; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val); - } else if (proto.type == 'u'){ + } else if (proto.type == customReadpartType::umi){ customGeo.minUmiLen += val; customGeo.maxUmiLen += val; modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val); From e85be971470f499ca32ee3b60085d82ae39e0ddd Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 27 Jan 2022 08:09:46 -0500 Subject: [PATCH 16/17] change addPadding approach --- src/AlevinUtils.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index 1954609c2..5ed890196 100755 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -629,12 +629,11 @@ namespace alevin { void addPadding(std::string& seq, uint32_t max, const char padBases[], uint32_t padLen) { int diff = max - seq.length() + 1; // add one base if the length is same to avoid erroneous collisions - int rep = diff / padLen; - int extra = diff % padLen; - for(int i = 0; i < rep; i++){ - seq += padBases; - } - for(int i = 0; i < extra; i++){ + for(int i = 0; i < diff; i++){ + if(i >= padLen){ + i -= padLen; + diff -= padLen; + } seq += padBases[i]; } } From e9dbddb10ef6a1c3b82821071b7f2d355ecc385c Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 27 Jan 2022 10:28:01 -0500 Subject: [PATCH 17/17] clarify comment for bc and umi vector --- include/SingleCellProtocols.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index e6a2d068a..0ad7ceda8 100755 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -227,7 +227,7 @@ namespace alevin{ struct CustomGeo { // store regex string for reads 1 and 2 static std::string reg[2]; - // store positions of matches for bc and umi + // store group numbers for bc and umi in regex string groups static itlib::small_vector b[2], u[2]; // bioRead stores the read number for biological read and bioPat stores match pattern number on regex static uint32_t bioRead, bioPat; // biological read would be contigous and on only 1 of the read