diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100644 new mode 100755 index cf2061e7a..b57fc3f7c --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -430,7 +430,7 @@ endif() ## set(Boost_ADDITIONAL_VERSIONS "1.59.0" "1.60.0" "1.61.0" "1.62.0" "1.63.0" "1.64.0" "1.65.0" "1.66.0" "1.67.0" "1.68.0" "1.69.0" "1.70.0" "1.71.0" "1.72.0" "1.73.0" "1.74.0" "1.75.0" "1.76.0" "1.77.0" "1.78.0") if (NOT BOOST_RECONFIGURE) -find_package(Boost 1.59.0 COMPONENTS iostreams system filesystem timer chrono program_options) +find_package(Boost 1.59.0 COMPONENTS iostreams filesystem system timer chrono program_options regex) message("BOOST_INCLUDEDIR = ${BOOST_INCLUDEDIR}") message("BOOST_LIBRARYDIR = ${BOOST_LIBRARYDIR}") message("Boost_FOUND = ${Boost_FOUND}") @@ -459,7 +459,7 @@ if(BOOST_RECONFIGURE) set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/external/install) set(Boost_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include) set(Boost_LIBRARY_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib) - find_package(Boost 1.59.0 COMPONENTS iostreams system filesystem timer chrono program_options locale REQUIRED) + find_package(Boost 1.59.0 COMPONENTS iostreams filesystem system timer chrono program_options locale regex REQUIRED) set(FETCH_BOOST FALSE) endif() @@ -484,7 +484,7 @@ elseif(FETCH_BOOST) ## Let the rest of the build process know we're going to be fetching boost set(BOOST_LIB_SUBSET --with-iostreams --with-atomic --with-chrono --with-container --with-date_time --with-exception --with-filesystem --with-graph --with-graph_parallel --with-math - --with-program_options --with-system --with-locale + --with-program_options --with-system --with-locale --with-regex --with-timer) set(BOOST_WILL_RECONFIGURE TRUE) set(FETCH_BOOST FALSE) diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp old mode 100644 new mode 100755 index f191d594c..43b3a9b2b --- a/include/AlevinUtils.hpp +++ b/include/AlevinUtils.hpp @@ -73,7 +73,9 @@ namespace alevin{ void readWhitelist(bfs::path& filePath, TrueBcsT& trueBarcodes); - unsigned int hammingDistance(const std::string s1, const std::string s2); + uint32_t hammingDistance(const std::string& s1, const std::string& s2); + + void addPadding(std::string& seq, uint32_t max, const char padBases[], uint32_t padLen); template bool processAlevinOpts(AlevinOpts& aopt, @@ -128,6 +130,11 @@ namespace alevin{ bool hasOneGene(const std::vector& txps, uint32_t& geneId, spp::sparse_hash_map& txpToGeneMap, const size_t numGenes); + + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat); + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat, uint32_t& bioPat); + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len); + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second); } } #endif // __ALEVIN_UTILS_HPP__ diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp old mode 100644 new mode 100755 index 1db3592dc..0ad7ceda8 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -3,10 +3,13 @@ #include #include +#include #include "AlevinOpts.hpp" #include "AlevinTypes.hpp" #include "pufferfish/itlib/static_vector.hpp" +#include "pufferfish/itlib/small_vector.hpp" + namespace alevin{ namespace protocols { @@ -220,7 +223,60 @@ namespace alevin{ BarcodeEnd end; }; + // Custom geometry specification using regex for extraction + struct CustomGeo { + // store regex string for reads 1 and 2 + static std::string reg[2]; + // store group numbers for bc and umi in regex string groups + static itlib::small_vector b[2], u[2]; + // bioRead stores the read number for biological read and bioPat stores match pattern number on regex + static uint32_t bioRead, bioPat; // biological read would be contigous and on only 1 of the read + static uint32_t minBcLen, maxBcLen, minUmiLen, maxUmiLen; + static bool bioReadFound; + static constexpr const char paddingBases[4] = {'A', 'C', 'G', 'T'}; + static constexpr uint32_t padLen = 4; + // store the matches for both reads + boost::smatch match[2]; + // store the success of regex search + bool rgx_search[2]; + // store the regex + static boost::regex rgx[2]; + // string for extract barcode and umi + std::string barcode, um; + // required + static uint32_t barcodeLength, umiLength; + BarcodeEnd end; + + TagGeometry umi_geo; + TagGeometry bc_geo; + TagGeometry read_geo; + + void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length(); } + void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length(); } + void set_read_geo(TagGeometry& g) { read_geo = g; } + uint32_t barcode_length() const { return barcodeLength; } + uint32_t umi_length() const { return umiLength; } + }; + } + + // CustomGeo type of read part + enum class customReadpartType : char { + bc = 'b', + umi = 'u', + fixed = 'f', + exclude = 'x' + }; + + // Store the readNumber and type info for CustomGeo + struct ProtoInfo + { + size_t readNumber; + customReadpartType type; + }; + } #endif + + diff --git a/src/Alevin.cpp b/src/Alevin.cpp index 8246b7263..a4724994a 100644 --- a/src/Alevin.cpp +++ b/src/Alevin.cpp @@ -1039,6 +1039,7 @@ salmon-based processing of single-cell RNA-seq data. bool custom_new = (vm.count("bc-geometry") and vm.count("umi-geometry")); bool custom = custom_old or custom_new; + bool custom_geo = vm.count("custom-geo"); uint8_t validate_num_protocols {0}; if (dropseq) validate_num_protocols += 1; @@ -1054,6 +1055,7 @@ salmon-based processing of single-cell RNA-seq data. if (quartzseq2) validate_num_protocols += 1; if (sciseq3) validate_num_protocols += 1; if (custom) validate_num_protocols += 1; + if (custom_geo) validate_num_protocols += 1; if ( validate_num_protocols != 1 ) { fmt::print(stderr, "ERROR: Please specify one and only one scRNA protocol;"); @@ -1189,6 +1191,12 @@ salmon-based processing of single-cell RNA-seq data. initiatePipeline(aopt, sopt, orderedOptions, vm, commentString, noTgMap, barcodeFiles, readFiles, salmonIndex); + } else if (custom_geo) { + AlevinOpts aopt; + //aopt.jointLog->warn("Using Custom Setting for Alevin"); + initiatePipeline(aopt, sopt, orderedOptions, + vm, commentString, noTgMap, + barcodeFiles, readFiles, salmonIndex); } } catch (po::error& e) { diff --git a/src/AlevinHash.cpp b/src/AlevinHash.cpp index 8bddb0636..c7f4cc676 100644 --- a/src/AlevinHash.cpp +++ b/src/AlevinHash.cpp @@ -331,3 +331,7 @@ int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); +template +int salmonHashQuantify(AlevinOpts& aopt, + bfs::path& outputDirectory, + CFreqMapT& freqCounter); diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp old mode 100644 new mode 100755 index a5bad4a9f..5ed890196 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -1,5 +1,6 @@ #include "AlevinUtils.hpp" #include "peglib.h" +#include namespace alevin { namespace utils { @@ -142,6 +143,18 @@ namespace alevin { //subseq = seq2; } template <> + std::string* getReadSequence(apt::CustomGeo& pt, + std::string& seq, + std::string& seq2, + std::string& subseq){ + int r = pt.bioRead - 1; + subseq.clear(); + if(pt.rgx_search[r]) { + subseq.append(pt.match[r][pt.bioPat]); + } + return &subseq; // return extracted if the rgx_search was success, empty otherwise + } + template <> std::string* getReadSequence(apt::Gemcode& protocol, std::string& seq, std::string& seq2, @@ -246,6 +259,25 @@ namespace alevin { return pt.umi_geo.extract_tag(read1, read2, umi); } template <> + bool extractUMI(std::string& read1, + std::string& read2, + apt::CustomGeo& pt, + std::string& umi){ + pt.um = ""; + for(int r=0; r < 2; r++) { + if(!pt.u[r].empty()) { // if umi is present on read r + for(int i : pt.u[r]) { + pt.um += pt.match[r][i]; // concat all umi sequences + } + } + } + if(pt.minUmiLen < pt.maxUmiLen) { + addPadding(pt.um, pt.maxUmiLen, pt.paddingBases, pt.padLen); + } + umi = pt.um; + return true; + } + template <> bool extractUMI(std::string& read, std::string& read2, apt::QuartzSeq2& pt, @@ -407,6 +439,31 @@ namespace alevin { return pt.bc_geo.extract_tag(read1, read2, bc); } template <> + bool extractBarcode(std::string& read1, + std::string& read2, + apt::CustomGeo& pt, + std::string& bc){ + pt.barcode=""; + for(int r=0; r < 2; r++) { + pt.rgx_search[r] = (r == 0) ? boost::regex_search(read1,pt.match[r],pt.rgx[r]) : + boost::regex_search(read2,pt.match[r],pt.rgx[r]); // using std::string instead of read1/2 results in blank umi. strange! + if(!pt.b[r].empty()) { // if read r has barcode + if(pt.rgx_search[r]){ // if rgx search was successful + for(int i : pt.b[r]) { + pt.barcode += pt.match[r][i]; // concat all barcode sequences + } + } else { + return false; + } + } + } + if(pt.minBcLen < pt.maxBcLen) { + addPadding(pt.barcode, pt.maxBcLen, pt.paddingBases, pt.padLen); + } + bc = pt.barcode; + return true; + } + template <> bool extractBarcode(std::string& read, std::string& read2, apt::QuartzSeq2& pt, @@ -560,16 +617,62 @@ namespace alevin { neighbors); } - unsigned int hammingDistance(const std::string s1, const std::string s2){ + uint32_t hammingDistance(const std::string& s1, const std::string& s2) { if(s1.size() != s2.size()){ throw std::invalid_argument("Strings have different lengths, can't compute hamming distance"); } // compute dot product for all postisions, start with 0 and add if the values are not equal - return std::inner_product(s1.begin(),s1.end(),s2.begin(), 0, std::plus(), + return std::inner_product(s1.begin(),s1.end(),s2.begin(), 0, std::plus(), std::not2(std::equal_to())); } + void addPadding(std::string& seq, uint32_t max, const char padBases[], uint32_t padLen) { + int diff = max - seq.length() + 1; // add one base if the length is same to avoid erroneous collisions + for(int i = 0; i < diff; i++){ + if(i >= padLen){ + i -= padLen; + diff -= padLen; + } + seq += padBases[i]; + } + } + + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t first, std::size_t second) + { + reg[readNumber-1] += "([ATGC]{" + std::to_string(first) + "," + std::to_string(second) +"})"; + if (type == customReadpartType::bc){ + bu[readNumber-1].push_back(nPat); + } else if (type == customReadpartType::umi) { + bu[readNumber-1].push_back(nPat); + } // else if (type == 'x') { } not needed to explicitly mention case 'x' + nPat++; + } + + void modifyRegex(size_t readNumber, customReadpartType type, std::string* reg, itlib::small_vector *bu, uint32_t& nPat, std::size_t len) + { + reg[readNumber-1] += "([ATGC]{" + std::to_string(len) +"})"; + if (type == customReadpartType::bc){ + bu[readNumber-1].push_back(nPat); + } else if (type == customReadpartType::umi){ + bu[readNumber-1].push_back(nPat); + } + nPat++; + } + + void modifyRegex(size_t readNumber, std::string seq, std::string* reg, uint32_t& nPat) + { + reg[readNumber -1] += "(" + seq + ")"; + nPat++; + } + + void modifyRegex(size_t readNumber, std::string* reg, uint32_t& nPat, uint32_t& bioPat) + { + reg[readNumber - 1] += "([ATGC]{1,})"; + bioPat = nPat; + nPat++; + } + void getTxpToGeneMap(spp::sparse_hash_map& txpToGeneMap, spp::sparse_hash_map& geneIdxMap, const std::string& t2gFileName, @@ -1008,6 +1111,7 @@ namespace alevin { bool have_custom_umi_geo = vm.count("umi-geometry"); bool have_custom_bc_geo = vm.count("bc-geometry"); bool have_custom_read_geo = vm.count("read-geometry"); + bool have_custom_geo = vm.count("custom-geo"); // need both bool have_any_custom_geo = have_custom_read_geo or have_custom_umi_geo or have_custom_bc_geo; bool have_all_custom_geo = have_custom_read_geo and have_custom_umi_geo and have_custom_bc_geo; @@ -1221,7 +1325,185 @@ namespace alevin { // if it's OK, set the umi kmer length alevin::types::AlevinUMIKmer::k( static_cast(umi_geo.length()) ); - } // new custom barcode geometry + } else if (have_custom_geo) { // regex based unified barcode geometry parsing + /* Custom Geometry (--custom-geo) should be used when: + * 1. Barcode or umi have variable lengths + * 2. There is known fixed sequence in the reads + * 3. There is some sequence to be excluded + * + * From the input peglib spec it creates a regex. Boost regex lib is used to parse + * the reads. + */ + + struct apt::CustomGeo customGeo; + uint32_t nPatterns = 1; + customGeo.minBcLen = 0, customGeo.maxBcLen = 0, customGeo.minUmiLen = 0, customGeo.minUmiLen = 0; + customGeo.bioReadFound = false; + struct ProtoInfo proto; + peg::parser parser(R"( + Specification <- ReadNumber1'{'Description{1,10}'}'ReadNumber2'{'Description{1,10}'}' + ReadNumber1 <- [1] + ReadNumber2 <- [2] + Description <- Type'['Lengths']' / Fixed'['Sequence']' / Read + Type <- 'b' / 'u' / 'x' + Fixed <- 'f' + Read <- 'r' + Sequence <- [ATGC]+ + Lengths <- (Length '-' Length) / Length + Length <- [0-9]+ + )"); + + + parser["Length"] = [](const peg::SemanticValues &sv) + { + auto val = static_cast(std::stoull(sv.token())); + if(val<=0){ + std::cerr << "Lengths should be > 0. Exiting." << std::endl; + exit(1); + }; + return val; + }; + + + parser["ReadNumber1"] = [&](const peg::SemanticValues &sv) + { + auto val = std::stoi(sv.token()); + proto.readNumber = val; + }; + + parser["ReadNumber2"] = [&](const peg::SemanticValues &sv) + { + auto val = std::stoi(sv.token()); + proto.readNumber = val; + nPatterns = 1; + }; + + parser["Description"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token()); + if(val == "r") { + if (!customGeo.bioReadFound){ + customGeo.bioRead = proto.readNumber; + customGeo.bioReadFound = true; + } else { + // aopt.jointLog->error("Only contigous biological read expected.\nExiting now."); + std::cerr << "Only contigous biological read expected.\nExiting now." << std::endl; + exit(1); + } + modifyRegex(proto.readNumber, customGeo.reg, nPatterns, customGeo.bioPat); + } + }; + + parser["Type"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token())[0]; + proto.type = static_cast(val); + }; + + parser["Fixed"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token())[0]; + proto.type = static_cast(val); + }; + + parser["Sequence"] = [&](const peg::SemanticValues &sv) + { + auto val = std::string(sv.token()); + modifyRegex(proto.readNumber, val, customGeo.reg, nPatterns); + }; + + parser["Lengths"] = [&](const peg::SemanticValues &sv) + { + switch (sv.choice()) + { + case 0: + { + auto val = std::make_pair( + peg::any_cast(sv[0]), peg::any_cast(sv[1])); + if(val.second <= val.first) { + // aopt.jointLog->error("In length range [X-Y], Y should be > X.\nExiting now"); + std::cerr << "In length range [X-Y], Y should be > X.\nExiting now" << std::endl; + exit(1); + } + if (proto.type == customReadpartType::bc) { + customGeo.minBcLen += val.first; + customGeo.maxBcLen += val.second; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val.first, val.second); + } else if (proto.type == customReadpartType::umi) { + customGeo.minUmiLen += val.first; + customGeo.maxUmiLen += val.second; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val.first, val.second); + } else { + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val.first, val.second); // case 'x' + } + } + break; + case 1: + { + auto val = peg::any_cast(sv[0]); + if (proto.type == customReadpartType::bc) { + customGeo.minBcLen += val; // a fixed length bc increases the min length too, not updating can cause logical error when there are 2 pos of bcs + customGeo.maxBcLen += val; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.b, nPatterns, val); + } else if (proto.type == customReadpartType::umi){ + customGeo.minUmiLen += val; + customGeo.maxUmiLen += val; + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val); + } else { + modifyRegex(proto.readNumber, proto.type, customGeo.reg, customGeo.u, nPatterns, val); // case 'x' + } + } + break; + } + }; + + // NOTE: this is just for backwards-compatibility. + // The barcode end is redundant with the new geometry + // specification. + aopt.protocol.end = BarcodeEnd::FIVE; + parser.enable_packrat_parsing(); + std::string geometry = vm["custom-geo"].as(); + auto val = parser.parse(geometry.c_str()); + if(val == 0) { + aopt.jointLog->error("Incorrect geometry spec: {} \n" + "Exiting now", geometry); + return false; + } + // std::cout << "reg[0]: " << customGeo.reg[0] << std::endl; + // std::cout << "reg[1]: " << customGeo.reg[1] << std::endl; + customGeo.rgx[0] = customGeo.reg[0]; + customGeo.rgx[1] = customGeo.reg[1]; + // validate that BC and UMI lengths are OK + uint32_t maxBC{31}; + uint32_t maxUMI{31}; + // the barcode length must be in [1,31] + if ((customGeo.minBcLen < 1) or (customGeo.maxBcLen > maxBC)) { + aopt.jointLog->error("Barcode length ({}) was not in the required length range [1, {}].\n" + "Exiting now.", customGeo.maxBcLen, maxBC); + return false; + } + if (customGeo.maxBcLen == customGeo.minBcLen) { + customGeo.barcodeLength = customGeo.maxBcLen; + } else { + customGeo.barcodeLength = customGeo.maxBcLen + 1; + } + // if it's OK, set the barcode kmer length + alevin::types::AlevinCellBarcodeKmer::k( static_cast(customGeo.barcodeLength) ); + // the UMI length must be in [1,31] + if ((customGeo.minUmiLen < 1) or (customGeo.maxUmiLen > maxUMI)) { + aopt.jointLog->error("UMI length ({}) was not in the required length range [1, {}].\n" + "Exiting now.", customGeo.maxUmiLen, maxUMI); + return false; + } + if (customGeo.maxUmiLen == customGeo.minUmiLen) { + customGeo.umiLength = customGeo.maxUmiLen; + } else { + customGeo.umiLength = customGeo.maxUmiLen + 1; + } + // if it's OK, set the umi kmer length + alevin::types::AlevinUMIKmer::k( static_cast(customGeo.umiLength) ); + + } //validate specified iupac if (aopt.iupac.size()>0){ @@ -1431,6 +1713,10 @@ namespace alevin { SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template + bool processAlevinOpts(AlevinOpts& aopt, + SalmonOpts& sopt, bool noTgMap, + boost::program_options::variables_map& vm); + template bool processAlevinOpts(AlevinOpts& aopt, SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt old mode 100644 new mode 100755 index 6ac84212e..cfe066e2c --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -234,6 +234,7 @@ target_link_libraries(salmon graphdump ntcard gff + alevin_core ${Boost_LIBRARIES} ${ICU_LIBS} ${STADEN_LIBRARIES} ${CURL_LIBRARIES} @@ -244,7 +245,7 @@ target_link_libraries(salmon ${LIBSALMON_LINKER_FLAGS} ${NON_APPLECLANG_LIBS} ksw2pp - alevin_core +## PUFF_INTEGRATION ${ASAN_LIB} ${FAST_MALLOC_LIB} TBB::tbb diff --git a/src/CollapsedCellOptimizer.cpp b/src/CollapsedCellOptimizer.cpp index 5c868864e..bb0b0c90b 100644 --- a/src/CollapsedCellOptimizer.cpp +++ b/src/CollapsedCellOptimizer.cpp @@ -1537,6 +1537,16 @@ bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, size_t numLowConfidentBarcode); template +bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, + spp::sparse_hash_map& txpToGeneMap, + spp::sparse_hash_map& geneIdxMap, + AlevinOpts& aopt, + GZipWriter& gzw, + std::vector& trueBarcodes, + std::vector& umiCount, + CFreqMapT& freqCounter, + size_t numLowConfidentBarcode); +template bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, spp::sparse_hash_map& txpToGeneMap, spp::sparse_hash_map& geneIdxMap, diff --git a/src/GZipWriter.cpp b/src/GZipWriter.cpp index 0fe3e4ff2..c05bf0fa9 100644 --- a/src/GZipWriter.cpp +++ b/src/GZipWriter.cpp @@ -1912,6 +1912,10 @@ template bool GZipWriter::writeEquivCounts( const AlevinOpts& aopts, SCExpT& readExp); +template +bool GZipWriter::writeEquivCounts( + const AlevinOpts& aopts, + SCExpT& readExp); template bool GZipWriter::writeMetaAlevin(const AlevinOpts& opts, boost::filesystem::path aux_dir); @@ -1956,4 +1960,8 @@ template bool GZipWriter::writeMetaAlevin(const AlevinOpts& opts, boost::filesystem::path aux_dir); +template bool +GZipWriter::writeMetaAlevin(const AlevinOpts& opts, + boost::filesystem::path aux_dir); + diff --git a/src/ProgramOptionsGenerator.cpp b/src/ProgramOptionsGenerator.cpp old mode 100644 new mode 100755 index e7f4da381..10150db82 --- a/src/ProgramOptionsGenerator.cpp +++ b/src/ProgramOptionsGenerator.cpp @@ -467,6 +467,9 @@ namespace salmon { ("umi-geometry", po::value(), "format string describing the genometry of the umi" ) + ("custom-geo", po::value(), + "unified custom geometry" + ) ( "end",po::value(), "Cell-Barcodes end (5 or 3) location in the read sequence from where barcode has to" diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index e5f45d988..ab03ab2a3 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -3221,3 +3221,17 @@ alevinQuant(AlevinOpts& aopt, SalmonOpts& sopt, boost::program_options::parsed_options& orderedOptions, CFreqMapT& freqCounter, size_t numLowConfidentBarcode, std::unique_ptr& salmonIndex); + +template int +alevin_sc_align(AlevinOpts& aopt, SalmonOpts& sopt, + boost::program_options::parsed_options& orderedOptions, + std::unique_ptr& salmonIndex); + +template int +alevinQuant(AlevinOpts& aopt, SalmonOpts& sopt, + SoftMapT& barcodeMap, TrueBcsT& trueBarcodes, + spp::sparse_hash_map& txpToGeneMap, + spp::sparse_hash_map& geneIdxMap, + boost::program_options::parsed_options& orderedOptions, + CFreqMapT& freqCounter, size_t numLowConfidentBarcode, + std::unique_ptr& salmonIndex); \ No newline at end of file diff --git a/src/SingleCellProtocols.cpp b/src/SingleCellProtocols.cpp index ab15695db..401da176f 100644 --- a/src/SingleCellProtocols.cpp +++ b/src/SingleCellProtocols.cpp @@ -1,6 +1,5 @@ #include "SingleCellProtocols.hpp" - namespace alevin{ namespace protocols { @@ -28,5 +27,18 @@ namespace alevin{ return os; } + // store regex for reads 1 and 2 + extern std::string CustomGeo::reg[2]; + // store positions of matches for bc and umi + extern itlib::small_vector CustomGeo::b[2], CustomGeo::u[2]; + // bioRead stores the read number for biological read and bioPat stores match pattern number on regex + extern uint32_t CustomGeo::bioRead, CustomGeo::bioPat; // biological read would be contigous and on only 1 of the read + extern uint32_t CustomGeo::minBcLen, CustomGeo::maxBcLen; + extern uint32_t CustomGeo::minUmiLen, CustomGeo::maxUmiLen; + extern uint32_t CustomGeo::barcodeLength, CustomGeo::umiLength; + extern constexpr const char CustomGeo::paddingBases[4]; + // bool alevin::protocols::CustomGeo::bioReadFound; + extern bool CustomGeo::bioReadFound; + extern boost::regex CustomGeo::rgx[2]; }// protocols }//alevin \ No newline at end of file diff --git a/src/WhiteList.cpp b/src/WhiteList.cpp index fd6559811..57ddc19c7 100644 --- a/src/WhiteList.cpp +++ b/src/WhiteList.cpp @@ -308,5 +308,9 @@ namespace alevin { std::vector& trueBarcodes, bool useRibo, bool useMito, size_t numLowConfidentBarcode); + template bool performWhitelisting(AlevinOpts& aopt, + std::vector& trueBarcodes, + bool useRibo, bool useMito, + size_t numLowConfidentBarcode); } }