Skip to content

Reconstruct & evaluate genome scale metabolic models with CarveMe and memote

Francisco Zorrilla edited this page Mar 22, 2021 · 2 revisions

First we extract protein bins from metaWRAP intermediate files with the following:

rule extractProteinBins:
    message:
        "Extract ORF annotated protein fasta files for each bin from reassembly checkm files."
    shell:
        """
        cd {config[path][root]}
        mkdir -p {config[folder][proteinBins]}

        echo -e "Begin moving and renaming ORF annotated protein fasta bins from reassembled_bins/ to protein_bins/ ... \n"
        for folder in reassembled_bins/*/;do 
            echo "Moving bins from sample $(echo $(basename $folder)) ... "
            for bin in $folder*reassembled_bins.checkm/bins/*;do 
                var=$(echo $bin/genes.faa | sed 's|reassembled_bins/||g'|sed 's|/reassembled_bins.checkm/bins||'|sed 's|/genes||g'|sed 's|/|_|g'|sed 's/permissive/p/g'|sed 's/orig/o/g'|sed 's/strict/s/g');
                cp $bin/*.faa {config[path][root]}/{config[folder][proteinBins]}/$var;
            done;
        done
        """

Now we will be expanding the {{binIDs}} wildcard using the filenames of bins in the folder generate above:

binIDs = get_ids_from_path_pattern('protein_bins/*.faa')

Then we use CarveMe to reconstruct genome scale metabolic models as follows:

rule carveme:
    input:
        bin = f'{config["path"]["root"]}/{config["folder"]["proteinBins"]}/{{binIDs}}.faa',
        media = f'{config["path"]["root"]}/{config["folder"]["scripts"]}/{config["scripts"]["carveme"]}'
    output:
        f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{binIDs}}.xml'
    benchmark:
        f'{config["path"]["root"]}/benchmarks/{{binIDs}}.carveme.benchmark.txt'
    message:
        """
        Make sure that the input files are ORF annotated and preferably protein fasta.
        If given raw fasta files, Carveme will run without errors but each contig will be treated as one gene.
        """
    shell:
        """
        echo "Activating {config[envs][metagem]} conda environment ... "
        set +u;source activate {config[envs][metagem]};set -u
        
        mkdir -p $(dirname {output})
        mkdir -p logs

        cp {input.bin} {input.media} $SCRATCHDIR
        cd $SCRATCHDIR
        
        echo "Begin carving GEM ... "
        carve -g {config[params][carveMedia]} \
            -v \
            --mediadb $(basename {input.media}) \
            --fbc2 \
            -o $(echo $(basename {input.bin}) | sed 's/.faa/.xml/g') $(basename {input.bin})
        
        echo "Done carving GEM. "
        [ -f *.xml ] && mv *.xml $(dirname {output})
        """

Since it is likely that some bins may fail to generate a metabolic model due to bad quality, we switch to the wildcard expansion of {{gemIDs}} to only take bin IDs of models that were succesfully generated:

gemIDs = get_ids_from_path_pattern('GEMs/*.xml')

Now we can quality control our models using MEMOTE:

rule memote:
    input:
        f'{config["path"]["root"]}/{config["folder"]["GEMs"]}/{{gemIDs}}.xml'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["memote"]}/{{gemIDs}}')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/{{gemIDs}}.memote.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u
        module load git
        mkdir -p {output}
        cp {input} $SCRATCHDIR
        cd $SCRATCHDIR
        memote report snapshot --skip test_find_metabolites_produced_with_closed_bounds --skip test_find_metabolites_consumed_with_closed_bounds --skip test_find_metabolites_not_produced_with_open_bounds --skip test_find_metabolites_not_consumed_with_open_bounds --skip test_find_incorrect_thermodynamic_reversibility --filename $(echo $(basename {input})|sed 's/.xml/.html/') *.xml
        memote run --skip test_find_metabolites_produced_with_closed_bounds --skip test_find_metabolites_consumed_with_closed_bounds --skip test_find_metabolites_not_produced_with_open_bounds --skip test_find_metabolites_not_consumed_with_open_bounds --skip test_find_incorrect_thermodynamic_reversibility *.xml
        mv result.json.gz $(echo $(basename {input})|sed 's/.xml/.json.gz/')
        mv *.gz *.html {output}
        """