Annotate-a-genome: Difference between revisions

From QiuLab
Jump to navigation Jump to search
imported>Weigang
imported>Weigang
Line 71: Line 71:
=Protocol=
=Protocol=
==Fetch genome sequences==
==Fetch genome sequences==
* Commands
<syntaxhighlight lang="bash" line enclose="div">
<syntaxhighlight lang="bash" line enclose="div">
./bioseq -z 'b31_accession' -o 'genbank' > b31.gb # Reference genome for ortholog identification. Choose main, cp26, or lp54
./bioseq -z 'b31_accession' -o 'genbank' > b31.gb # Reference genome for ortholog identification. Choose main, cp26, or lp54
Line 79: Line 80:
./bioseq -t new.nuc > new.pep
./bioseq -t new.nuc > new.pep
</syntaxhighlight>
</syntaxhighlight>
* Perl code for "gb2fas.pl"
<syntaxhighlight lang="perl" line enclose="div">
#!/usr/bin/env perl
# Extract sequences from a GenBank file
# Input: a GenBank file
# Output: -n: CDS sequences in FASTA; -t: CDS information in Tab-delimited
use strict;
use Bio::SeqIO;
use Getopt::Std;
use Data::Dumper;
use 5.10.0;
my %opts;
getopts('tn',\%opts);
die "$0 [-nt] <genbank_file>\n" unless @ARGV == 1;
my $gb_file = shift @ARGV;
my $in = Bio::SeqIO->new(-file=>$gb_file, -format=>'genbank');
my $cds_ct=0;
while (my $seqobj = $in->next_seq() ) {
    my @features = $seqobj->get_SeqFeatures(); # just top level
    foreach my $feat ( @features ) {
next unless $feat->primary_tag eq "CDS";
$cds_ct++;
&to_db($feat, $cds_ct) if $opts{t};
&to_nt($feat, $seqobj) if $opts{n};
    }
}
exit;
sub to_nt {
    my $ft = shift;
    my $seq = shift;
    say ">", $ft->get_tag_values("locus_tag");
    my $subseq = $seq->trunc($ft->start, $ft->end);
    if ($ft->strand > 0) {
say $subseq->seq();
    } else {
say $subseq->revcom()->seq();
    }
}
sub to_db {
    my $ft = shift;
    my $ct = shift;
    my $orf_id = sprintf "ORF%04d", $ct;
    my $gid = 401;
    my $con_id = 111114823;
    my $locus =  sprintf "%s", $ft->get_tag_values('locus_tag');
    my $prod = sprintf "%s", $ft->get_tag_values('product');
    $prod =~ tr/\'/_/;
    my $strand = ($ft->strand > 0) ? 't' : 'f';
    say join "\t", ($gid, $con_id, $orf_id, 'f', $ft->start, $ft->end, $strand, $locus, $prod);
}
</syntaxhighlight>
==Predict orthologs with reciprocal BLAST==
==Predict orthologs with reciprocal BLAST==
<syntaxhighlight lang="bash" line start="7" enclose="div">
<syntaxhighlight lang="bash" line start="7" enclose="div">

Revision as of 22:22, 13 February 2014

Project Goals

  • Annotate and add newly sequenced Borrelia genomes to BorreliaBase
  • Build an informatics pipeline for gene prediction, ortholog calls, databasing, and synteny analysis

Download genome sequences from GenBank

Strain Species Group Genome Sequences Notes
B31 B. burgdorferi (reference genome) Lyme Disease Example
CA382 B. burgdorferi (California) Lyme Disease Example
CA8 B. burgdorferi (California) Lyme Disease Example
BgVir B. garinii (Russia) Lyme Disease Example
NMJW1 B. garinii (China) Lyme Disease Example
HLJ01 B. afzelii (China) Lyme Disease Example
Ly B. duttonii (Tanzania) Relapsing Fever Example
A1 B. recurrentis (Ethiopia) Relapsing Fever Example
DAH B. hermsii (Washington State) Relapsing Fever Example
91E135 B. turicatae (Texas) Relapsing Fever Example
Achema B. crocidurae (Mauritania) Relapsing Fever Example
HR1 B. parkeri (??) Relapsing Fever Example
LB-2001 B. miyamotoi (Northeast US) Relapsing Fever Example

Protocol

Fetch genome sequences

  • Commands
./bioseq -z 'b31_accession' -o 'genbank' > b31.gb # Reference genome for ortholog identification. Choose main, cp26, or lp54
./bioseq -z 'gb_accession' -o 'genbank' > new.gb
./gb2fas -n b31.gb > b31.nuc # Extract CDS
./gb2fas -n new.gb > new.nuc
./bioseq -t b31.nuc > b31.pep # Translate (and remove those with internal stop codons)
./bioseq -t new.nuc > new.pep
  • Perl code for "gb2fas.pl"
#!/usr/bin/env perl
# Extract sequences from a GenBank file
# Input: a GenBank file
# Output: -n: CDS sequences in FASTA; -t: CDS information in Tab-delimited
use strict;
use Bio::SeqIO;
use Getopt::Std;
use Data::Dumper;
use 5.10.0;

my %opts;
getopts('tn',\%opts);

die "$0 [-nt] <genbank_file>\n" unless @ARGV == 1;
my $gb_file = shift @ARGV;
my $in = Bio::SeqIO->new(-file=>$gb_file, -format=>'genbank');
my $cds_ct=0;
while (my $seqobj = $in->next_seq() ) {
    my @features = $seqobj->get_SeqFeatures(); # just top level
    foreach my $feat ( @features ) {
	next unless $feat->primary_tag eq "CDS";
	$cds_ct++;
	&to_db($feat, $cds_ct) if $opts{t};
	&to_nt($feat, $seqobj) if $opts{n};
    }
}

exit;

sub to_nt {
    my $ft = shift;
    my $seq = shift;
    say ">", $ft->get_tag_values("locus_tag");
    my $subseq = $seq->trunc($ft->start, $ft->end);
    if ($ft->strand > 0) {
	say $subseq->seq();
    } else {
	say $subseq->revcom()->seq();
    }
}

sub to_db {
    my $ft = shift;
    my $ct = shift;
    my $orf_id = sprintf "ORF%04d", $ct;
    my $gid = 401;
    my $con_id = 111114823;
    my $locus =  sprintf "%s", $ft->get_tag_values('locus_tag');
    my $prod = sprintf "%s", $ft->get_tag_values('product');
    $prod =~ tr/\'/_/;
    my $strand = ($ft->strand > 0) ? 't' : 'f';
    say join "\t", ($gid, $con_id, $orf_id, 'f', $ft->start, $ft->end, $strand, $locus, $prod);
}

Predict orthologs with reciprocal BLAST

makeblastdb -in b31.pep -parse_seqids # Prepare the reference DB
makeblastdb -in new.pep -parse_seqids # Prepare the new genome DB
blastp -query new.pep -db b31.pep -outfmt 6 -evalue 1e-3 -out forward_blast.out # Forward BLAST
blastp -query b31.pep -db new.pep -outfmt 6 -evalue 1e-3 -out reverse_blast.out # Reverse BLAST
./check-reciprocal.pl forward_blast.out reverse_blast.out > new.orthlogs 2> new.not-orthologs # Identify orthologs

Verify with synteny broswer

./gb2fas -t new.gb > new-to-orf-table.txt