Skip to content

Commit

Permalink
Added option --alphabet to all goalign commands
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Nov 14, 2023
1 parent 3d96af9 commit 634d0ea
Show file tree
Hide file tree
Showing 9 changed files with 317 additions and 111 deletions.
4 changes: 2 additions & 2 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ func NewAlign(alphabet int) *align {
// If the alphabet name is not known, returns align.UNKNOWN
func AlphabetFromString(alphabet string) int {
switch strings.ToLower(alphabet) {
case "dna", "rna", "nucleotide":
case "dna", "rna", "nucleotide", "nt":
return NUCLEOTIDS
case "protein":
case "protein", "aa":
return AMINOACIDS
default:
return UNKNOWN
Expand Down
59 changes: 57 additions & 2 deletions align/seqbag.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ type SeqBag interface {
AddSequenceChar(name string, sequence []uint8, comment string) error
AppendSeqIdentifier(identifier string, right bool)
Alphabet() int
SetAlphabet(int) error // Sets the alphabet
AlphabetStr() string
AlphabetCharacters() []uint8
AlphabetCharToIndex(c uint8) int // Returns index of the character (nt or aa) in the AlphabetCharacters() array
AutoAlphabet() // detects and sets alphabet automatically for all the sequences
DetectAlphabet() (alphabet int) // detects the compatible alphabets
CharStats() map[uint8]int64
UniqueCharacters() []uint8
CharStatsSeq(idx int) (map[uint8]int, error) // Computes frequency of characters for the given sequence
Expand Down Expand Up @@ -194,6 +196,38 @@ func (sb *seqbag) Alphabet() int {
return sb.alphabet
}

// SetAlphabet sets the alphabet of the current sequences
// alphabet can be align.AMINOACIDS or align.NUCLEOTIDS
// otherwise returns an error
func (sb *seqbag) SetAlphabet(alphabet int) (err error) {
alpha := sb.DetectAlphabet()
if alpha == UNKNOWN {
err = fmt.Errorf("input sequence alphabet unknown")
return
}

if alphabet == NUCLEOTIDS {
if alpha == NUCLEOTIDS || alpha == BOTH {
sb.alphabet = NUCLEOTIDS
} else {
err = fmt.Errorf("given alphabet is not compatible with input sequences")
return
}
} else if alphabet == AMINOACIDS {
if alpha == AMINOACIDS || alpha == BOTH {
sb.alphabet = AMINOACIDS
} else {
err = fmt.Errorf("given alphabet is not compatible with input sequences")
return
}
} else {
err = fmt.Errorf("given alphabet can not be used in alignment")
return
}

return
}

func (sb *seqbag) AlphabetStr() string {
switch sb.Alphabet() {
case NUCLEOTIDS:
Expand Down Expand Up @@ -507,7 +541,9 @@ func (sb *seqbag) appendToSequence(name string, sequence []uint8) error {
return nil
}

func (sb *seqbag) AutoAlphabet() {
// Detects the alphabets compatible with the alignment
// can be align.BOTH, align.NUCLEOTIDS, align.AMINOACIDS, or align.UNKNOWN
func (sb *seqbag) DetectAlphabet() (alphabet int) {
isaa := true
isnt := true

Expand All @@ -532,8 +568,27 @@ func (sb *seqbag) AutoAlphabet() {
})

if isnt {
if isaa {
alphabet = BOTH
} else {
alphabet = NUCLEOTIDS
}
} else {
if isaa {
alphabet = AMINOACIDS
} else {
alphabet = UNKNOWN
}
}
return
}

func (sb *seqbag) AutoAlphabet() {
alphabet := sb.DetectAlphabet()

if alphabet == BOTH || alphabet == NUCLEOTIDS {
sb.alphabet = NUCLEOTIDS
} else if isaa {
} else if alphabet == AMINOACIDS {
sb.alphabet = AMINOACIDS
} else {
sb.alphabet = UNKNOWN
Expand Down
22 changes: 20 additions & 2 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ var rootinputstrict bool = false
var rootoutputstrict bool = false
var rootoutputoneline = false
var rootoutputnoblock = false
var rootalphabet string
var rootAutoDetectInputFormat bool
var seed int64 = -1
var unaligned bool
Expand Down Expand Up @@ -122,14 +123,26 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
var fi goio.Closer
var r *bufio.Reader
var format int

var alphabet int
alchan = &align.AlignChannel{}

switch rootalphabet {
case "auto":
alphabet = align.BOTH
case "nt":
alphabet = align.NUCLEOTIDS
case "aa":
alphabet = align.AMINOACIDS
default:
err = fmt.Errorf("given alphabet is not supported: %s", rootalphabet)
return
}

if fi, r, err = utils.GetReader(file); err != nil {
return
}
if rootAutoDetectInputFormat {
if alchan, format, err = utils.ParseMultiAlignmentsAuto(fi, r, rootinputstrict); err != nil {
if alchan, format, err = utils.ParseMultiAlignmentsAuto(fi, r, rootinputstrict, alphabet); err != nil {
return
}
if format == align.FORMAT_PHYLIP {
Expand All @@ -144,13 +157,15 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
alchan.Achan = make(chan align.Alignment, 15)
go func() {
pp := phylip.NewParser(r, rootinputstrict)
pp.Alphabet(alphabet)
pp.IgnoreIdentical(ignoreidentical)
pp.ParseMultiple(alchan)
fi.Close()
}()
} else if rootnexus {
var al align.Alignment
np := nexus.NewParser(r)
np.Alphabet(alphabet)
np.IgnoreIdentical(ignoreidentical)
if al, err = np.Parse(); err != nil {
return
Expand All @@ -162,6 +177,7 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
} else if rootclustal {
var al align.Alignment
cp := clustal.NewParser(r)
cp.Alphabet(alphabet)
cp.IgnoreIdentical(ignoreidentical)
if al, err = cp.Parse(); err != nil {
return
Expand All @@ -173,6 +189,7 @@ func readalign(file string) (alchan *align.AlignChannel, err error) {
} else {
var al align.Alignment
fp := fasta.NewParser(r)
fp.Alphabet(alphabet)
fp.IgnoreIdentical(ignoreidentical)
if al, err = fp.Parse(); err != nil {
return
Expand Down Expand Up @@ -215,6 +232,7 @@ func init() {
RootCmd.PersistentFlags().BoolVar(&rootoutputstrict, "output-strict", false, "Strict phylip output format (only used with -p)")
RootCmd.PersistentFlags().BoolVar(&rootoutputoneline, "one-line", false, "Write Phylip sequences on 1 line (only used with -p)")
RootCmd.PersistentFlags().BoolVar(&rootoutputnoblock, "no-block", false, "Write Phylip sequences without space separated blocks (only used with -p)")
RootCmd.PersistentFlags().StringVar(&rootalphabet, "alphabet", "auto", "Alignment/Sequences alphabet: auto (default), aa, or nt")

RootCmd.PersistentFlags().BoolVar(&rootAutoDetectInputFormat, "auto-detect", false, "Auto detects input format (overrides -p, -x and -u)")

Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Almost all commands can have the following arguments:
3. Clustal
4. Phylip
If none of these formats is recognized, then will exit with an error. Please also note that in `--auto-detect` mode, phylip format is considered as not strict.
* `--alphabet`: Used to specify which alphabet must be used to parse the alignment. It can be `auto` (default), `aa`, or `nt`. By default, the alphabet is deduced from the content of the input file. In the case of nexus format, when `--alphabet auto` is specified, the alphabet specified in the nexus file is used. Otherwise, this option overrides the nexus file alphabet.

Output is written to stdout by default, but can be generally written to files with the `-o` option. If the given output file has a `.gz` or `.xz` extension, the output is compressed accordingly.

Expand Down
70 changes: 52 additions & 18 deletions io/clustal/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
type Parser struct {
s *Scanner
ignoreidentical int
alphabet int // can be align.BOTH, align.AMINOACIDS or align.NUCLEOTIDS
buf struct {
tok Token // last read token
lit string // last read literal
Expand All @@ -25,13 +26,28 @@ type Parser struct {

// NewParser returns a new instance of Parser.
func NewParser(r io.Reader) *Parser {
return &Parser{s: NewScanner(r), ignoreidentical: align.IGNORE_NONE}
return &Parser{s: NewScanner(r), ignoreidentical: align.IGNORE_NONE, alphabet: align.BOTH}
}

// If sets to true, then will ignore duplicate sequences that have the same name and the same sequence
// Otherwise, it just renames them just as the sequences that have same name and different sequences
func (p *Parser) IgnoreIdentical(ignore int) {
func (p *Parser) IgnoreIdentical(ignore int) *Parser {
p.ignoreidentical = ignore
return p
}

// alphabet: can be align.BOTH (auto detect alphabet), align.NUCLEOTIDS (considers alignment as nucleotides),
// or align.AMINOACIDS (considers the alignment as aminoacids). If not auto, can return an error if the alignment
// is not compatible with the given alphabet.
// If another value is given, then align.BOTH is considered
func (p *Parser) Alphabet(alphabet int) *Parser {
p.alphabet = alphabet
if p.alphabet != align.BOTH &&
p.alphabet != align.NUCLEOTIDS &&
p.alphabet != align.AMINOACIDS {
p.alphabet = align.BOTH
}
return p
}

// scan returns the next token from the underlying scanner.
Expand Down Expand Up @@ -69,14 +85,13 @@ func (p *Parser) scanWithEOL() (tok Token, lit string) {
func (p *Parser) unscan() { p.buf.n = 1 }

// Parse parses a clustal alignment
func (p *Parser) Parse() (align.Alignment, error) {
func (p *Parser) Parse() (al align.Alignment, err error) {
var nbseq int = 0
var al align.Alignment
var names []string = make([]string, 0)
var seqs []string = make([]string, 0)
tok, lit := p.scan()
if tok != CLUSTAL {
return nil, errors.New("Clustal alignment file must start with 'CLUSTAL'")
return nil, errors.New("clustal alignment file must start with 'CLUSTAL'")
}
for tok != ENDOFLINE && tok != EOF {
tok, lit = p.scanWithEOL()
Expand All @@ -91,50 +106,57 @@ func (p *Parser) Parse() (align.Alignment, error) {
// Last line of a block
if tok == WS {
if currentnbseqs == 0 {
return nil, errors.New("There should not be a white space here, only at last line of blocks")
return nil, errors.New("there should not be a white space here, only at last line of blocks")
}
if nbseq != 0 && currentnbseqs != nbseq {
return nil, fmt.Errorf("Conservation line: Sequence block nb %d has different number of sequence (%d)",
err = fmt.Errorf("conservation line: Sequence block nb %d has different number of sequence (%d)",
nblocks, currentnbseqs)
return
}
for tok != ENDOFLINE && tok != EOF {
tok, lit = p.scan()
}
if tok != ENDOFLINE {
return nil, errors.New("There should be a new line after degree of conservation line")
err = errors.New("there should be a new line after degree of conservation line")
return
}
tok, lit = p.scanWithEOL()
if tok == EOF {
break
}
if tok != ENDOFLINE {
return nil, errors.New("There should be a new line after degree of conservation line")
err = errors.New("there should be a new line after degree of conservation line")
return
}
tok, lit = p.scan()
if tok == EOF {
break
}
if nbseq != 0 && currentnbseqs != nbseq {
return nil, fmt.Errorf("Sequence block nb %d has different number of sequence (%d)",
err = fmt.Errorf("sequence block nb %d has different number of sequence (%d)",
nblocks, currentnbseqs)
return
}
nbseq = currentnbseqs
nblocks++
currentnbseqs = 0
}

if tok != IDENTIFIER && tok != NUMERIC {
return nil, errors.New("We expect a sequence identifier here")
err = errors.New("we expect a sequence identifier here")
return
}
name = lit
tok, lit = p.scan()
if tok != WS {
return nil, errors.New("We expect a whitespace after sequence name")
err = errors.New("we expect a whitespace after sequence name")
return
}

tok, lit = p.scan()
if tok != IDENTIFIER {
return nil, errors.New("We expect a sequence here")
err = errors.New("we expect a sequence here")
return
}
seq = lit

Expand All @@ -146,27 +168,31 @@ func (p *Parser) Parse() (align.Alignment, error) {
// skip
tok, lit = p.scan()
} else {
return nil, errors.New("We expect a current length after sequence + whitespace")
err = errors.New("we expect a current length after sequence + whitespace")
return
}
}
if tok != ENDOFLINE {
return nil, errors.New("We expect ENDOFLINE after sequence in a block")
err = errors.New("we expect ENDOFLINE after sequence in a block")
return
}

if nblocks == 0 {
names = append(names, name)
seqs = append(seqs, seq)
} else {
if names[currentnbseqs] != name {
return nil, fmt.Errorf("Name at block %d line %d does not correspond to name in first block (%s vs. %s)", nblocks, currentnbseqs, names[currentnbseqs], name)
err = fmt.Errorf("name at block %d line %d does not correspond to name in first block (%s vs. %s)", nblocks, currentnbseqs, names[currentnbseqs], name)
return
}
seqs[currentnbseqs] = seqs[currentnbseqs] + seq
}
currentnbseqs++
}

if len(names) == 0 {
return nil, errors.New("No sequences in the alignment")
err = errors.New("no sequences in the alignment")
return
}

for i, n := range names {
Expand All @@ -180,5 +206,13 @@ func (p *Parser) Parse() (align.Alignment, error) {
}
}

return al, nil
if p.alphabet == align.BOTH {
al.AutoAlphabet()
} else {
if err = al.SetAlphabet(p.alphabet); err != nil {
return
}
}

return
}
Loading

0 comments on commit 634d0ea

Please sign in to comment.