Mini-Tutorals
BERT classify
- Project start: Winter 2020
- Project team: Saadmanul Islam <Saadmanul.Islam91@myhunter.cuny.edu>
- Tutorial page: https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04
- Data set: bioluminascence
- Goal: cross validation
Estimate LD50 using GLM
## curves
library(dplyr)
library(ggplot2)
library(growthcurver)
library(reshape2)
library(purrr)
setwd("/Users/desireepante/Desktop/Programming")
OD20<-read.csv("OD_0220.csv")
OD220<-OD20[-c(6,11:12,20,26:28, 32:33,45:47),]
od <- filter(OD220, IPTG == 0);
#N15
od15d <- mutate(od15d, od.norm = OD/max(OD))
test.1<-glm(min ~ OD, data= od15d, family= "binomial")
ilink<-family(test.1)$linkinv
test.1.pd <- with(od15d,
data.frame(min = seq(min(min), max(min),
length = 10)))
test.1.pd <- cbind(test.1.pd, predict(test.1, test.1.pd, type = "link", se.fit = TRUE)[1:2])
test.1.pd <- transform(test.1.pd, Fitted = ilink(fit), Upper = ilink(fit + ( se.fit)),
Lower = ilink(fit - ( se.fit)))
test.test<-ggplot(od15d, aes(x = min, y = od.norm)) +
geom_ribbon(data = test.1.pd, aes(ymin = Lower, ymax = Upper, x = min),
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
geom_line(data = test.1.pd, aes(y = Fitted, x = min)) +
geom_point()
test.test
1k genome project/VCFTools
# Extract APOL1 locus
vcftools --gzvcf ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --from-bp 36253071 --to-bp 36267530 --recode --stdout --recode-INFO-all --chr 22 | gzip -c > apol1.vcf.gz
# allele frequencies/counts
vcftools --vcf pvt1.recode.vcf --keep AFR.supergroup --counts --out afr
# pi
# Fst
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop AMR.supergroup --vcf pvt1.recode.vcf --out AFR-AMR --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AFR-EAS --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AFR-EUR --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AFR-SAS --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AMR-EAS --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AMR-EUR --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AMR-SAS --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out EAS-EUR --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EAS-SAS --remove-indels
vcftools --weir-fst-pop EUR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EUR-SAS --remove-indels
# concatenate vcf from different chromosomes
vcf-concat apol1-exon-6-snps.vcf tvp23-exon-1-snps.vcf hpr-exon-5-snps.vcf > 3-exons.vcf
# vcf to tabular format
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp
vcf-query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp
Ggplot2 tips
- Data binning and nice histograms with density and boxplot:
# numerical vector as factor so that it is sorted numerically (no need for as.character())
x.df$train.size <- as.factor(x.df$train.size)
# jitter by group
ggplot(x.df, aes(x=rate.cat, y=accuracy, color=group)) + geom_boxplot(position=position_dodge(0.8)) + geom_jitter(position=position_jitterdodge(0.8)) + facet_wrap(~train.size, nrow = 1) + ylim(0,1) + geom_abline(intercept = 0.5, slope = 0, linetype="dashed") + theme(legend.position = "bottom")
Capture results in R/GA example
# in general use
lapply(seq_along(x), function(i) {name=names(x}[i])
# to capture names
# save to a data.frame
# bind into a single data.frame using dplyr function:
x.df <-bind_rows(x.rate)
library(stringdist)
library(ggplot2)
out.fit <- lapply(seq(from = 5,to = 150, by = 5), function (num.epi) {
num.alle <- 20; # num of antigens
epi.fit <- function(x) { # fitness defined by the minimum pair diff
x.list <- split(x, rep(1:num.alle, each=num.epi))
# sum(seq_distmatrix(x.list, method = "hamming"))/(num.alle * (num.alle-1) / 2)/num.epi # average pairwise diff
min(seq_distmatrix(x.list, method = "hamming"))/num.epi # min diff
}
epi.ga <- ga(type = "binary", fitness = epi.fit, nBits = num.epi * num.alle)
c(all = num.alle, epi = num.epi, min.pair.dist = epi.ga@fitnessValue)
})
epi.df <- t(as.data.frame(out.fit))
rownames(epi.df) <- 1:nrow(epi.df)
ggplot(as.data.frame(epi.df), aes(x=epi, y=min.pair.dist)) + geom_point() + geom_line()
NCI Cloud (Seven Bridges)
- Documentation: http://www.cancergenomicscloud.org/controlled-access-data/
- API Documentation (to access meta-data)
- Steps (June 20, 2018)
- Create project
- Data -> Data Overview -> Select cancer type -> Data Browser
- Copy files to project
- Go to project, "add filter", filter by "experimental strategy" (RNA-SEQ, miRNA-SEQ)
- To download files: select & Download link; run
aria2c -i download-links.txt
- To download meta data: Download manifest
- Steps (better)
- Create project
- Search "Public Apps"
- Load data & Run
- Steps (a little hard to follow)
- Create project
- Data -> Data Overview -> Select cancer type -> Data Browser
- Click File to add property (e.g., "miRNA") -> "Copy Files to Project" -> choose project to add & add a tag to the files
- Go to Project -> Files -> filter files by tag name -> select -> Save
- Choose a public protocol/workflow (e.g., RNA-SEQ alignment)
- Add index and GTF files
- Go to Project -> task -> run
Reloop a circular plasmid from PacBio assembly
- Input file: Z9-cp26.fa (containing overlapping ends); b31-cp26.fa (reference)
- Find overlap ends by blasting against itself: blastn -task megablast -query Z9-cp26.fa -subject Z9-cp26.fa -evalue 1e-100 -outfmt 6
- The result says “1 -7617” is the same as “26489-34118”, so we use bioseq to remove overlapping part: bioseq –s”1,26488” Z9-cp26.fa > Z9-cp26.fa2
- Run numcer against b31-cp26 to identify the starting position: nucmer --coord ../b31-cp26.fa Z9-cp26.fa2
- The result says the first base of B31 corresponds to 22171 at Z9-cp26, so we run bioseq to reloop: bioseq –R 22171 Z9-cp26.fa2 > Z9-cp26.fa3
- Run numcer to confirm: nucmer --coord ../b31-cp26.fa Z9-cp26.fa3
KEGG REST API
# Author: Edgaras Bezrucenkovas
# Date: Nov 12, 2017
#KEGG API
#Obtain module pathways and orthologs from KEGG pa14 database
#Import module to fetch the data from KEGG database
import urllib.request
import re #Regular expressions
#Get PA14 modules
kegg_db = "http://rest.kegg.jp/link/module/pau"
pa14_mdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_mods = []
for line in pa14_mdata:
line = line.split("\t")
mod = line[1].strip("md:pau_")
pa14_mods.append(mod)
#Get PA14 genes
kegg_db = "http://rest.kegg.jp/link/pau/module"
pa14_gdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_genes = []
for line in pa14_gdata:
gdata = line.split("\t")
gdata[0] = gdata[0].strip("md:pau_")
gdata[1] = gdata[1].strip("pau:")
pa14_genes.append(gdata)
#Open the file containing KEGG compounds ids
id_file = open("kegg-entries.txt")
kegg_ids = id_file.readlines()
compound_ids = []
for cid in kegg_ids:
cid = cid.strip("\n")
compound_ids.append(cid)
id_file.close()
#Get module pathway ids
for cid in compound_ids:
kegg_db = "http://rest.kegg.jp/link/module/compound:"+cid
module_data = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
module_ids = []
for line in module_data:
if re.match("cpd", line):
line = line.split("\t")
m_id = line[1].strip("md:")
module_ids.append(m_id)
else:
module_ids.append("NA")
#Filter module pathway ids
modules_final = []
for m_id in module_ids:
if m_id == "NA":
modules_final.append("NA")
elif m_id in pa14_mods:
modules_final.append(m_id)
if not modules_final:
modules_final.append("NA")
#Get genes
genes = {}
for m_id in modules_final:
key = m_id
genes.setdefault(key, [])
if m_id == "NA":
genes[key].append("NA")
else:
for i in pa14_genes:
if m_id == i[0]:
genes[key].append(i[1])
#Get module names using module ids
module_names = {}
for m_id in modules_final:
if m_id == "NA":
m_name = {"NA":"NA"}
module_names.update(m_name)
else:
kegg_db = "http://rest.kegg.jp/list/module:"+m_id
module_data = urllib.request.urlopen(kegg_db).read().decode("utf-8").split("\t")
module_name = module_data[1].strip("\n")
m = {m_id:module_name}
module_names.update(m)
#Output
for k, v in genes.items():
for i in v:
print(cid,k,i,module_names.get(k), sep = "\t")
D3js Tutorial
- Install brackets editor from adobe: http://brackets.io/
- Enable web server & create a directory "public_html" in your home directory
- Create the following file structure: test/index.html; test/js/index.js; test/css/index.css
- Basic D3: following this link
- Metabolomics: following this link
Transform a wide data frame to a long one
- A function in R
df2long = function(x, varname, cname, rname){
num.col = dim(x)[2]
x$tmp = rownames(x)
y = reshape(x, varying=1:num.col, v.names=varname, timevar=cname, times=names(x)[1:num.col], direction="long")
y=y[,-4]
rownames(y)=NULL
colnames(y)[1] = rname
y
}
- A function in Perl
#!/usr/bin/env perl
# expect a data frame with row and col headings
use strict;
use warnings;
use Data::Dumper;
my $first = 0;
my @colnames;
my %data;
while(<>) {
chomp;
$first++;
if ($first == 1) {
@colnames = split;
next;
}
my @data = split;
my $rowname = shift @data;
for (my $i=0; $i<=$#data; $i++) {
$data{$rowname}->{$colnames[$i]} = $data[$i];
}
}
#print Dumper(\%data);
foreach my $row (keys %data) {
foreach my $col (@colnames) {
print $row, "\t", $col, "\t", $data{$row}->{$col}, "\n";
}
}
exit;
Variant verification using IGV
- Prepare & load reference genomes
- Load FASTA:
bioseq -i'genbank' ref.gb > ref.fas
- Prepare GFF3 file:
bp_genbank2gff3.pl --CDS --filter exon --filter gene /home/chongdi/michelle/pa_rp73.gb
- Load FASTA:
- Prepare BAM files (see below on how to index and align reads using bwa and samtools, in "ospC Amplicon")
- Index sorted bam files with samtools & load BAM files (multiple bam files okay)
KRAKEN
- assigns taxonomic labels to short DNA reads by examining the k-mers within a read and querying a database with those k-mers:
kraken -db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 --fastq-input 02015P1_S18_L001_R1_001.fastq.gz 02015P1_S18_L001_R2_001.fastq.gz --gzip-compressed --output outfile --paired
- summarize taxonomy:
kraken-translate --db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 outfile > predict.file
- count unique strains:
pat-8]$ cut -f2 s75.predict | cut -f10 -d';' | grep "^[a-zA-Z]" | sort | uniq -c | sort -rn
. - Pick the strain with the highest counts as the reference genome
PATRIC microbial genome annotation CLI
- Install Debian distribution from github: https://github.com/PATRIC3/PATRIC-distribution/releases/tag/1.015
- Tutorial: http://tutorial.theseed.org/PATRIC-Tutorials/
- Fetch all genomes from a genus:
p3-all-genomes --in genome_name,Borrelia > Borrelia.genomes
;p3-get-genome-data -i Borrelia.genomes -e genome_status,complete > Borrelia-complete.genomes
- Retrieves features from coding sequences only: input file must be a list of genome ids with a header!
p3-get-genome-features -i $file -e feature.feature_type,CDS -a pgfam_id -a patric_id -a plfam_id -a start -a end -a strand -a genome_name -a product -a accession > ${name}-all-features.txt
- Retrieve feature sequence:
p3-echo -t feature.accession "fig|100177.4.peg.1" | p3-get-feature-sequence --dna
- Submit genomes for annotation
- Log in: p3-login user name/password
- Submit a genome for annotation:
submit-patric-annotation --scientific-name "Pseudomonas aeruginosa" --taxonomy-id 287 --genetic-code 11 --domain Bacteria --log pa.log /weigang@patricbrc.org/home/wsdir gid_5.fas
submit-patric-annotation --scientific-name "Streptococcus agalactiae" --taxonomy-id 1311 --genetic-code 11 --domain Bacteria --log sa-1.log /weigang@patricbrc.org/home/wsdir sa-1.fas
- Message: "Uploading gid_5.fas to workspace at /weigang@patricbrc.org/home/wsdir/gid_5/gid_5.fas”
- Create genome groups
- login
cat trep.complete.ids | p3-put-genome-group "Treponema complete”
p3-get-genome-group "Treponema complete”
Parsimony reconstruction using MPR
# MPR for each column
plot.mpr <- function(column=1) {
plot.phylo(tr.unrooted, main = paste(colnames(ph.t)[column]))
tmpr<-MPR(ph.t[,column], tr.unrooted, outgroup = "gid_1311.1320")
nodelabels(paste("[", tmpr[, 1], ",", tmpr[, 2], "]", sep = ""))
tiplabels(ph.t[,column][tr.unrooted$tip.label], adj = -2)
}
# add internal node states
datalist <- data.frame(fam=character(), a=numeric(), b=numeric(), c=numeric())
for (i in 1:ncol(ph.t)) {
#for (i in 1:10) {
tmpr<-MPR(ph.t[,i], tr.unrooted, outgroup = "gid_1311.1320");
out<-c(colnames(ph.t)[i],tmpr[,1])
datalist <- rbind(datalist, data.frame(fam=colnames(ph.t)[i], a=tmpr[1,1], b=tmpr[2,1], c=tmpr[3,1]))
}
#plotting gains and losses
tr.unrooted <- unroot(tr)
gains<-apply(ph.node.states[,9:15], 2, function(x) length(which(x>0)))
losses<-apply(ph.node.states[,9:15], 2, function(x) length(which(x<0)))
plot(tr.unrooted)
edgelabels(gains, adj = c(0.5,-0.25), col=2, frame= "none")
edgelabels(losses, adj = c(0.5,1.25), col=4, frame= "none")
ospC amplicon identification
# BASH script from Tony Fung & Jamie Cabigas (Spring 2019)
#!/usr/bin/env bash
# ----------------------------------------
# File : fas-to-vcf.sh
# Author : Tony Fung & Jamie Cabigas
# Date : March 21, 2019
# Description : Batch convert FAS files to VCF files
# Requirements : samtools 1.9, bcftools 1.9, bwa 0.7.12
# Input : reference file, format of sequence files, up to 7 sequence id's
# Sample Command : bash fas-to-vcf.sh ref.fasta fastq ERR221622 ERR221661 ERR221662 ERR221663
# Output : up to 7 VCF files
# ----------------------------------------
if [ $# -lt 3 ]
then
echo "Usage: bash $0 <ref_file> <format_of_sequence_files> <sequence_id> [<sequence_id_2> <sequence_id_3> etc.]";
exit;
fi
for arg
do
if [ $arg == $1 ]
then
ref_file="../$1";
elif [ $arg == $2 ]
then
file_format=$2;
else
seq_id=$arg;
fas_file_1="${seq_id}_1.$file_format";
fas_file_2="${seq_id}_2.$file_format";
pwd;
cd $seq_id;
pwd;
# Index nucleotide file:
bwa index $ref_file;
# Align:
bwa mem $ref_file $fas_file_1 $fas_file_2 > $seq_id.sam
# BAM output:
samtools view -b $seq_id.sam > $seq_id.bam
# Sort & index BAM file:
samtools sort $seq_id.bam -o $seq_id.sorted.bam
samtools index $seq_id.sorted.bam
# Generate VCF file:
# -u is deprecated, use bcftools mpileup instead
# samtools mpileup -uf $ref_file $seq_id.sorted.bam > $seq_id.mpileup
bcftools mpileup -f $ref_file $seq_id.sorted.bam > $seq_id.mpileup
bcftools call -c -v $seq_id.mpileup > $seq_id.raw.vcf
cd ..;
pwd;
fi
done
# de novo amplicon assembly with cortex_con (quality threshold 10, minimum coverage 100)
~/cortex_con/bin/cortex_con_31 --input cortex-input-file-list --input_format fastq --kmer_size 31 --mem_height 17 mem_width 100 -d cortex-out-2 -q 10 -s 100
# Then run mapsembler to extend on both ends (-t2: contig, -p: prefix, -c: minimum coverage)
~/mapsembler2_pipeline/run_mapsembler2_pipeline.sh -s seq-24r.nuc -r "4_S4_L001_R1_001.fastq.gz 4_S4_L001_R2_001.fastq.gz" -t2 -p map-4 -c 200
#Index nucleotide file:
bwa index ref.fa
# may also be necessary to match reads separately
bioseq --break ref.fa
for f in *.fas; do bwa.index $f; done
#align:
bwa mem ref.fa sample_read1.fq sample_read2.fq > sample.sam
#BAM output:
samtools view -b sample.sam > sample.bam
#Sort & index BAM file:
samtools sort sample.bam sample.sorted
samtools index sample.sorted
# Generate VCF file:
samtools mpileup -uf ref.fa s18.sorted.bam s56.sorted.bam > test.mpileup
bcftools call -c -v test.mpileup > test.raw.vcf
#Extract Coverage by samtools (not good: capped artificially at depth=8k)
samtools depth sample.sorted.bam > sample.depth
# Calculate coverage by bedtools (better, raw read coverages)
bedtools coverage -abam sample.sorted.bam -b refs.bed -d > sample.bedtools.cov # per-site
bedtools coverage -abam sample.sorted.bam -b refs.bed -count > sample.bedtools.cov # average
#R plot:
library(lattice)
x <- read.table("sample.depth", sep="\t", header=F)
colnames(x) <- c("ref", "pos", "cov")
xyplot(log10(cov) ~ pos|ref, data= x, type = "l", main= "sample")
Running a Screen Session
use byobu
- start a screen session by typing "byobu"
- run commands
- Detach by pressing "F6"
- Reattach by typing "byobu"
- Terminate by typing "exit"
use screen
- Start a screen session
screen
- Detach the running mission
ctrl + A + D
- Show the list of running process
screen -ls
- Reattach a running process
screen -r ProcessID
- Terminate a process
ctrl + A
:quit
Bp-utils: sequence, alignment & tree utilities by Qiu Lab
bioseq: sequence/FASTA manipulations
- Use accession "CP002316.1" to retrieve the Genbank file from NCBI. Save the output (in genbank format) to a file named as "cp002316.gb".
bioseq -f "CP002316.1" -o'genbank' > cp002316.gb
- Use the above file as input, extract FASTA sequences for each genes and save the output to a new file called "cp002316.nuc". Use this file for the following questions.
bioseq -i "genbank" -F cp002316.gb > cp002316.fas
- Count the number of sequences.
bioseq -n cp002316.fas
- In a single command, pick the first 10 sequences and find their length
bioseq -p "order:1-10" cp002316.fas | bioseq –l
- In a single command, pick the third and seventh sequences from the file and do the 3-frame translation. Which reading frame is the correct on both? Specify
bioseq -p "order:3,7" cp002316.fas | bioseq -t3
- Find the base composition of the last two sequences
bioseq -p "order:25-26" cp002316.fas| bioseq –c
- Pick the sequence with id "Bbu|D1_B11|8784|9302|1" and count the number of codons present in this sequence
bioseq -p "id:BbuJD1_B11|8784|9302|1" cp002316.fas | bioseq –C
- Delete the last 10 sequences from the file and save the output to cp002316-v2.nuc
bioseq -d "order:17-26" cp002316.fas > cp002316-v2.nuc
- In a single command, pick the first sequence, then get the 50-110 nucleotides and make reverse complement of the sub-sequences
bioseq -p "order:1" cp002316.fas | bioseq -s "50,110" | bioseq –r
- In a single command, get the first 100 nucleotides of all the sequences present in the file and do 1-frame translation of all sub-sequences.
bioseq -s "1,100" cp002316.fas | bioseq -t1
bioaln: alignment/CLUSTALW manipulations
- Go to /home/shared/LabMeetingReadings/Test-Data and find the sequence alignment file “bioaln_tutorial.aln”. Name the format of the alignment file. Use it to answer all the questions below.
CLUSTALW
- Find the length of the alignment.
bioaln -l bioaln_tutorial.aln
- Count the number of the sequences present in the alignment.
bioaln -n bioaln_tutorial.aln
- How do you convert this alignment in phylip format? Save the output.
bioaln -o "phylip" bioaln_tutorial.aln > test.phy
- Pick “seq2, seq5, seq7, seq10” from the alignment and calculate their average percent identity.
bioaln -p "seq2, seq5, seq7, seq10" bioaln_tutorial.aln | bioaln -a
- Get an alignment slice from “50-140” and find the average identities of the slice for sliding windows of 25.
bioaln -s "50, 140" bioaln_tutorial.aln | bioaln -w "25"
- Extract conserved blocks from the alignment.
bioaln -B bioaln_tutorial.aln
- Find the unique sequences and list their ids.
bioaln -u bioaln_tutorial.aln | bioaln -L
- Extract third sites from the alignment and show only variable sites in match view.
bioaln -T bioaln_tutorial.aln | bioaln -v | bioaln -m
- Remove the gaps and show the final alignment in codon view for an alignment slice “1-100”.
bioaln -s "1, 100" bioaln_tutorial.aln | bioaln -g | bioaln -c
- Add a 90% consensus sequence and then show the final alignment in match plus codon view for an alignment slice “20-80”. (Hint: match view followed by codon view)
bioaln -s "20, 80" bioaln_tutorial.aln | bioaln -C "90" | bioaln -m | bioaln -c
biotree: tree/NEWICK manipulations
biopop: SNP statistics
Homology searching and clustering
BLAST+: search("google") for homologs/pariwise alignment
hmmer
cdhit
cdhit -i all.pep -o all.cdhit -c 0.5 -n 3
Options:
- -i: input file
- -o: output file
- -c: percent identity (below which it is considered different families)
- -n: word length
interproscan
../../software/interproscan/interproscan-5.13-52.0/interproscan.sh -i trep-cdhit.representatives.pep2 -o trep-representatives.tsv -t p -goterms -pa -f tsv
Documentation page: How to run
Programs for producing multiple alignments
MUSCLE
MUGSY
- MUGSY bash
#!/bin/sh
export MUGSY_INSTALL=/home/weigang/mugsy_x86-64-v1r2.2
export PATH=$PATH:$MUGSY_INSTALL:$MUGSY_INSTALL/mapping
export PERL5LIB=$MUGSY_INSTALL/perllibs
#For testing TBA
#export PATH=$PATH:$MUGSY_INSTALL/../../multiz-tba/trunk/
- source the bash file
source mugsyenv.sh
- run mugsy
mugsy --directory /home/chongdi/Streptococcus/mugsy-output -prefix mugsy_aln mugsy-input/*.fa
CLUSTALW
MAFT
TCOFFEE
Programs for producing phylogeny & phylogenetic analysis
IQ-Tree
Beginner's tutorial
iqtree -s example.phy -m WAG+I+G -bb 1000
FastTree
PHYLIP
MrBayes
RaXML
- Required arguments
- -s alignment (in PHYLIP or FASTA)
- -n tag
- Simple run (ML):
raxmlHPC-SSE3 -x 12345 -p 12345 -# autoMRE -s concat.fas -m GTRGAMMA -n tag -q part.txt
- Bootstrap:
raxmlHPC-SSE3 -f a -x 12345 -p 12345 -# 100 -s concat.fas -m GTRCAT -n tag -q part.txt
- Make a file named "part.txt" with the following lines (Chanlge the number to the total length of your alignment):
DNA, gene1codon1 = 1-3765906\3 DNA, gene1codon2 = 2-3765906\3 DNA, gene1codon3 = 3-3765906\3
- The resulting files
- RAxML_bestTree.tag: best tree (no bootstrap)
- RAxML_bipartitionsBranchLabels.tag: ignore
- RAxML_bipartitions.tag: main result. Feed this tree to figtree
- RAxML_bootstrap.tag: ignore
- RAxML_info.tag : log file
- Protein models
raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAS
# protein, gamma, Whelan & Goldman (2001) modelraxmlHPC-SSE3 -s protein.phy -n A2 -m PROTGAMMAGTR
# protein, gamma, user modelraxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123
# protein, gamma, Whelan & Goldman (2001) model, bootstrapraxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -o Carp,Loach
# protein, gamma, Whelan & Goldman (2001) model, root on a (monophyletic group)raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAS -# autoFC -p 0123
# protein, gamma, Whelan & Goldman (2001) model, bootstrap (at least 99 splits, auto-stopping)raxmlHPC-SSE3 -f a -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 -x 0123
# Rapid bootstrap with consensus- output 1, RAXML_bestTree.A1 (ml tree)
- output 2, RAXMLbootstrap.A1 (bootstrap relicates)
- output 3, RAXMLbipartitions.A1 (tree with boot strap values)
PhyloNet
R packages for phylogenetics
APE
Convert a tree to ultrametric using time estimates
t <- read.tree("core-tree-30otus.dnd")
t.ultra <- chronopl(t, 0)
t.hclust <- as.hclust(t.ultra)
t.dendro <- as.dendrogram(t.hclust)
heatmap(ge.var, Rowv=t.dendro) # order the rows with customized tree
phengorn
phytools
Population genetics
LDHat: test of recombination based on 4-gametes
- filter sites:
convert -seq snps.sites -loc snps.locs -freqcut 0.08
- Pairwise LD tests: First generate "lookup" table:
lkgen -lk /usr/local/LDhat/lk_files/lk_n50_t0.001 -nseq 48
; then calculate pairwise LD statistics:pairwise -seq sites.txt -loc locs.txt -lk new_lk.txt
- Run interval for recombination hot spots:
interval -seq sites.txt -loc locs.txt -lk new_lk.txt -its 1000000 -bgen 5 -exact
- Get stats:
stat -input rates.txt -loc locs.txt -burn 50 (burnin=100K)
- Plot in R:
x = read.table("rates.txt", skip=1, fill=T);
x = as.matrix(x);
burn.in = 50;
low = as.integer(nrow(x)*burn.in/100);
means<-apply(x[low:nrow(x),], 2, mean, na.rm=T);
q.95<-apply(x[low:nrow(x),], 2, quantile, probs=c(0.025, 0.5, 0.975), na.rm=T);
pos<-as.vector(as.matrix(read.table("locs.txt", as.is=T, skip=1)));
plot(pos[1:(length(pos)-1)], y=means[2:length(means)], type="s", col=rgb(0,0,0.5),
xlab="SNP position (B111)", ylab="Posterior mean recombination rate", main="GBS Strep: recombination by LDhat");
lines(pos[1:(length(pos)-1)], y=q.95[1,2:length(means)], type="s", col=grey(0.75), lty="dotted");
lines(pos[1:(length(pos)-1)], y=q.95[3,2:length(means)], type="s", col=grey(0.75), lty="dotted");
ms: coalescence simulation
SFS: forward simulation
PAML: testing selection with Ka/Ks
Microbial genome databases & pipelines in Qiu Lab
borreliabase
pa2
spiro_genomes/treponema
Blast a set of genes against a bacteria genome
- download genome
- extract gene sequences & translate
- Make blast database
- Run blastp
de novo variant call with cortex_var
Create binary file of fasta genome file.
Run contex_var_31_c1 (cutoff 1 used for 1 genome)
- --se_list is the command the reads the list you want to target (ie: list-genome.txt)
- --kmer_size is the middle size, has to be an odd integer
- --mem_width always choose 17
- --mem_height always choose 100
- --dump_binary Name your file name (ie: Genome.ctx)
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--se_list list-Evo.txt
--kmer_size 31
--mem_width 17
--mem_height 100
--dump_binary Evo.ctx
> Evo.log save log file
Read each binary file (.ctx) into its own individual color list (ls Evo.ctx > Evo.colorlist) Then save these lists into their own collective colorlist.txt (ls *.ctx > colorlist.txt)
Reveal genetic variation using the Bubble Caller from cortex_var.
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5
--se_list colorlist.txt
--kmer_size 31
--mem_width 17
--mem_height 100
--dump_binary all-colors.ctx
> all-colors.log save log file
Bubble caller will detect differences between each genome by assigning distinct colors to each genome (note that the UK spelling of color is used: colour)
- --multicolour_bin holds your all-colors.ctx binary from the Bubble Caller
- --detect_bubbles1 i/i Detects 1 variation between genomes i and i. i indicates the position number the genome is listed on the colorlist.txt file. If the genome is fourth on the colorlist.txt, for example, its corresponding i variable is 4
- --output_bubbles1 Output variant reads in fasta format (ie: Evo-RefHG.var for bubble detection between
Evolved genome and Reference HG genome)
- --print_colour_coverages necessary for output
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5
--kmer_size 31
--mem_height 17
--mem_width 100
--multicolour_bin all-colors.ctx
--detect_bubbles1 0/1
--output_bubbles1 Evo-RefHG.var
--print_colour_coverages
> Evo-RefHG.log save log file
Variant call with cortex_var
Example
- Files
MS00018_S5_L001_R1_001.fastq
MS00018_S5_L001_R2_001.fastq
KU-090401_S3_L001_R1_001.fastq
KU-090401_S3_L001_R2_001.fastq
...
- File content
@M03268:52:000000000-AJFAY:1:1101:16970:1555 1:N:0:7
CCCATGAACGGCACGTTCACGATGCAGAAGGTGGTGACCAGGCCGGTGTCGGCGACCTCGACGTAATCGTCGGTGGGGGA
GCCGTCGCGCGGGTCCGCGCCGCGCGGCGGGAAGTACACCTTGCCGTCCATGCGGGCGCGGGCGCCCAGCAGACGACCCT
CCATGAGCGCATTCAGGAATTCGGCCTCCTGCGGCGCGGCGGTGTGCTTGATATTGAAGTCGACCGGGGTCACGATGCCG
GTGACCGGTT
+
11>1>111@11>1A1FGF1FC0F0A111010GB0ECBFGF0/AFECGGFHECE??EEGHE/EEEHEFHEHHGCEECE??/
<CFCGGCCGCCCCCCHCGGCCCG@G??@@?@-@BFFFFFFFFFFF;B@FBFBFF<?@;@@-=?@??@-EFFFFF@;@@@F
FFBF/BBF;@@FFFFFFFFFFFFF@FFFFFFF<@@@@@@@@@@@FBFFFFFFFFFFFFFFFFF@@@?@?@FFFFFFFBF@
?@@EFF@@<B
@M03268:52:000000000-AJFAY:1:1101:16136:1618 1:N:0:7
TCCTGGCCCGTGAAACCGCTTGCCCGGTACAGGTTCTGGACTACCGCCTGGCACCCGAGCATCCGTTCCCGGCGGCGCTC
GACGACGCCGAGGCGGCGTTCCTGGAACTGGTGGCCGCCGGCTACCGGCCCGAACGCATTGCGGTCGCGGGTGATTCGGCCGGTGGCGGGCTCTCGCTCGCGCTGGCCGAACGGCTGCGCGACCGGCACGGGCTGGTTCCGGCCGCGCTCGGGCTGATCGCGCCCTGGGC
+
11>>11C1A@A?11BDF?EEGAFGFG?ECCHBHHGFG1EGHFHHHCCGGC/GCGGGCEECEFGHGGFFGHGGGGCECCC<CCC@CCCC@CGCCCGGC?:@EBFBF/CFFFF0/CFG?=@=-9>AFF@=@@?@;@@@F--9:BF@A@@?@--;9-BFFFF@A-@99B?-9@?=EFFBAAE;9>?;@@BF@9-@-;@=-E@@@=@@;@?>@9@<?@?BBBFFF;?>;;-@@?@<?@?9-FBFF@-99-9E-9
...
Step 1
- Create matched FASTQ files (python script)
#!/usr/bin/python
from sys import argv
script, File1, File2 = argv
# Create a dictionary listing the sequences in the first file for reference
file1 = open(File1)
dict1 = {}
for line in file1:
if '@M03268' in line:
tag1 = line.rstrip()[:-9]
tail = line.rstrip()[-9:]
dict1[tag1] = []
else:
dict1[tag1].append(line.rstrip())
file1.close()
# Create two output files
f1 = open(File1.replace('.fastq', '_mat.fq'), 'w')
f2 = open(File2.replace('.fastq', '_mat.fq'), 'w')
# Match the sequence
file2 = open(File2)
for line in file2:
if dict1.has_key(line.rstrip()[:-9]): # The has_key method
tag1 = line.rstrip()[:-9]
f1.write(tag1 + tail + '\n')
for j in range(3):
f1.write(dict1[tag1][j] + '\n')
del dict1[tag1]
dict2 = {} # Create a temporary dictionary for sequence in the file2
tag2 = line.rstrip()
dict2[tag2] = []
else:
dict2[tag2].append(line.rstrip())
if len(dict2[tag2]) == 3:
f2.write(tag2 + '\n')
for j in range(3):
f2.write(dict2[tag2][j] + '\n')
file2.close()
f1.close()
f2.close()
Step 2. Clean Fastq Files & Run Single-color Graph & Error Cleaning
- Create a file list showing all outcome files whose extensions need to be changed from _mat.fq to .list
bbduk.sh -Xmx1g in1=fastq_file1 in2=fastq_file2 -out1=clean1.fq -out2=clean2.fq qtrim=rl trimq=20
- Create a file list showing all outcome files whose extensions need to be changed from _mat.fq to .list
for f in *_mat.fq;
do
title=$(echo $f | cut -d'_' -f2);
id=$(echo $f | cut -d'_' -f1);
echo $f > ${id}.list${title};
done
- Single color graph for sample
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--pe_list MS00018_S5.list1,MS00018_S5.list2
--kmer_size 31
--mem_height 17
--mem_width 100
--dump_binary MS00018_S5.ctx
--sample_id MS00018_S5
--remove_pcr_duplicates
--quality_score_threshold 20 > MS00018_S5.log
- Single color graph for reference
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--se_list ref.filelist
--kmer_size 31
--mem_height 17
--mem_width 100
--dump_binary ref.ctx
--sample_id ref 20 > ref.log
- Run Error Cleaning for All Samples (reference is not included)
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--mem_height 18
--mem_width 100
--kmer_size 31
--multicolour_bin N18_S15.ctx
--remove_low_coverage_supernodes 10
--dump_binary N18_S15.cleaned.ctx
Step 3
- Pull the name of each .cleaned.ctx file to a cleaned.list file, then create a .filelist file for all cleaned.list files.
ls file1.cleaned.ctx > file1.cleaned.list
ls file2.cleaned.ctx > file2.cleaned.list
ls ref.ctx > ref.list
ls -1 *.list > ref-sample.filelist
- Multicolour Graph
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3
--mem_height 20
--mem_width 100
--kmer_size 31
--colour_list ref-sample.filelist
--dump_binary ref-sample.ctx > ref-sample.log
Step 4
- Variation Discovery Using The Bubble Caller
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3
--mem_height 20
--mem_width 100
--kmer_size 31
--multicolour_bin ref-sample.ctx
--detect_bubbles1 -1/-1
--ref_colour 2
--output_bubbles1 bubbles-in-sample.out
--print_colour_coverages
--experiment_type EachColourAHaploidSampleExceptTheRefColour
--genome_size 8000000 > bubbles-in-sample.log
Step 5
- Reference Genome
(When in Cluster execute "module load stampy": doesn't work; path problem) (Run the following on wallace:)
stampy.py -G ref ref.fa
stampy.py -g ref -H ref
- Turn Into VCF with reference
Make a sample list file (from bubble or multicolor log file):
cat e1-bubbles-in-sample.log | grep CLEANED | cut -f2 > e1.sample.lis
Customize the following command based on your output files, num of colors, index of ref colors, etc
perl /home/weigang/CORTEX_release_v1.0.5.21/scripts/analyse_variants/process_calls-wq.pl --callfile e1-bubbles-in-sample.out --callfile_log e1-bubbles-in-sample.log -outvcf e1-bubbles-in-sample --outdir e1-vcfout --samplename_list e1.sample.list --num_cols 7 --stampy_bin /home/weigang/stampy-1.0.28/stampy.py --stampy_hash ref --refcol 6 --vcftools_dir /usr/local/bin --caller BC --kmer 31 --ploidy 1
Step 6. Parse VCF files
- Filter out low-quality sites:
vcftools --vcf pat-5.decomp.vcf --keep-filtered PASS --recode --out pat-5
- Extract coverage
vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info COV
(output file: out.COV.FORMAT)
- Extract genotypes
vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT
(output file: out.GT.FORMAT)
- Extract confidence
vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT_CONF
(output file: out.GT_CONF.FORMAT)
- Send GT_CONF file to R and visualize log10(conf) distribution with boxplot
- Use custom PERL file to filter out low-quality (e.g., GT_CONF < 30) genotype calls (flag with "?" or "na"), and make haplotype GT ("1/1" to "1", "0/0" to "0", "./." to "?)
- Verify variants using IGV (see IGV protocol above)
Step 7. Variant Annotation & Visualization
vcf_parser.py out.decomp.vcf ref.gb
(the code needs validation)- Data to database & web visualization, if necessary
hmmer
Annotate proteins with TIGRFAM
hmmsearch --tblout foo.hmmout # table output for all sequences
--domtblout foo.dmout # table output for all domains
-E 0.01 # level of sequence significance
--domE 0.01 # level of domain significance
-o /dev/null # don't show STDOUT
../../TIGRFAMs-Release-15-Sep-17-2014/TIGRFAMs_15.0_HMM.LIB # HMM profile library for tiger fams
GCA_000583715.2_ASM58371v2_protein.faa & # input/query file in FASTA
PopGenome
library(PopGenome)
g = readVCF("pvt1.recode.vcf.gz", 1000, "8", 127890000, 128102000)
pops = split(sample[,1], sample[,2]) # create a list of populations
g = set.populations(g, pops, diploid = T) # set population names
# by windows
slide = sliding.window.transform(g, width = 100, jump = 20) # nsnps, not actual length
slide = F_ST.stats(slide, mode = "nucleotide")
snp.pos = slide@region.data@biallelic.sites # SNP positions
win.num = length(slide@region.names)
win.start = numeric()
for (i in 1:win.num) {win.start[i] = snp.pos[[i]][1]}
fst = slide@nuc.F_ST.vs.all
pop.names = names(slide@populations) # population names
plot(win.start, fst[,1], type ="n", las = 1, ylab = expression(F[st]), xlab = "SNP Position", ylim = c(0,0.4))
for (i in 1:length(slide@populations)) {
lines(win.start, fst[,i], type = "l", col = pop.group[pop.names[i],4])
}
arch.coords=c(127982050, 127992931)
abline(v = arch.coords, col = "orange")
#rect(xleft = arch.coords[1], ybottom = -1, xright = arch.coords[2], ytop = 0.5, border = "transparent", col = 2)
Velvet
Cleaning
../../bbmap/bbduk.sh -Xmx1g
in1=WGC067462_hunhewenku_509_combined_R1.fastq # gz file works as well
in2=WGC067462_hunhewenku_509_combined_R2.fastq # gz file works as well
-out1=clean1.fq
-out2=clean2.fq
qtrim=rl
trimq=20
Running Velvet Optimizer
srun ../../VelvetOptimiser-2.2.5/VelvetOptimiser.pl
--t 32
--s 31 --e 31 --x 6 # kmer sizes
-f '-shortPaired -fastq clean1.fq -shortPaired2 -fastq clean2.fq'
-t 4
--optFuncKmer ‘n50’
-p prefix
PacBio assembly with canu
./canu -p staph-auto-5 -d staph-auto-5 genomeSize=2.2m -pacbio-raw pac-reads-5.tar.gz