Mini-Tutorals: Difference between revisions
imported>Weigang m (→interproscan) |
(→R tips) |
||
(240 intermediate revisions by 5 users not shown) | |||
Line 1: | Line 1: | ||
=Weblogo= | |||
* install weblogo from [https://github.com/WebLogo/weblogo github] | |||
* Or install with conda: <code>conda install -c conda-forge weblogo</code> | |||
* Or install with pip: <code>pip install weblogo</code> | |||
* run the following command | |||
<syntaxhighlight lang="bash"> | |||
weblogo -f IR4-pep.fas -o ir4-logo.pdf -D fasta -F pdf -A protein -s large -t "IR4" -c chemistry | |||
</syntaxhighlight> | |||
=Tree visual with ggtree= | |||
<syntaxhighlight lang="sas"> | |||
setwd("C:/Users/Weigang/Dropbox/Natasha-files/") | |||
library(ggtree) | |||
library(treeio) | |||
library(tidyverse) | |||
# read tree | |||
tree <- read.tree("asian_clade3_v2.nwk") | |||
# create a tibble of OTU groups | |||
tips <- tibble(id = tree$tip.label, | |||
origin = c(rep("Eurasia",4), "US", "Eurasia", | |||
"US", rep("Eurasia",25))) | |||
# plot tree | |||
p <- ggtree(tree) + | |||
xlim(0,0.02) + # to avoid overflow of labels | |||
geom_treescale(x=0, y=30) | |||
# join tree and add group color | |||
p %<+% tips + | |||
geom_tiplab(aes(color = origin)) + | |||
scale_color_manual(values = c("Eurasia" = 1, "US" = 2)) + | |||
theme(legend.position = "none") | |||
</syntaxhighlight> | |||
=Github for sharing data and source code= | |||
* SSH setup (github no longer allows password-based push to remote repository) | |||
# [https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent Create a new SSH key on your computer] | |||
# [https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account Add the key to your Github account] | |||
# [https://gist.github.com/xirixiz/b6b0c6f4917ce17a90e00f9b60566278 Commit with SSH] | |||
## Authentication/test: <code>ssh -T git@github.com</code> | |||
## Add repository for commit: <code>git remote set-url origin git@github.com:bioperl/p5-bpwrapper.git</code> | |||
* Developer workflow | |||
** Clone remote repository: <code>git clone <rep-name.git></code> | |||
** Sync local to remote repository: <code>git pull</code> | |||
** Check local repository status: <code>git status</code> | |||
** Show the latest commit: <code>git log</code> | |||
** Add new file: <code>git add <filename> </code> | |||
** Commit changes to local repository: <code>git commit -a -m "message"</code> | |||
** Push to remote repository: <code>git push</code> | |||
=bcftools pipeline= | |||
bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf | |||
vcftools --vcf snps4.vcf --maf 0.01 --recode --recode-INFO-all --out maf (n=33 SNPs) | |||
bcftools query -l maf.recode.vcf > samples (n=17775 samples) | |||
cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' maf.recode.vcf; echo; done > samples-maf.fas | |||
=R tips= | |||
==Add expected normal curvce to histogram== | |||
<syntaxhighlight lang='bash'> | |||
plot.trait <- function(qt.out) { | |||
phe <- qt.out[[4]]$phe | |||
exp.mean <- qt.out[['trait.exp']][1] | |||
exp.sd <- sqrt(qt.out[['trait.exp']][2]) | |||
x <- seq(min(phe), max(phe), length = 100) | |||
fun <- dnorm(x, mean = exp.mean, sd = exp.sd) | |||
hist(phe, probability = T, ylim = c(0, max(fun)), br = 50, las = 1) | |||
lines(x, fun, col = 2, lwd = 2) | |||
} | |||
</syntaxhighlight> | |||
==Run batch contingency table tests== | |||
# Numbered list item | |||
<syntaxhighlight lang='bash'> | |||
snp_ct <- snp_long %>% group_by(POS, geno, virulence) %>% count() | |||
# get pos with < 4 entries: | |||
bad_pos <- snp_ct %>% group_by(POS) %>% count() %>% filter(n < 4) | |||
library(broom) | |||
# batch fisher's exact test | |||
snp_fisher <- snp_long %>% | |||
filter(!POS %in% bad_pos$POS) %>% | |||
group_by(POS) %>% | |||
do(tidy(xtabs(~geno + virulence, data = .) %>% | |||
fisher.test(simulate.p.value = T))) | |||
snp_fisher <- snp_fisher %>% mutate(y = -log10(p.value)) | |||
# plot vocano | |||
snp_fisher %>% | |||
ggplot(aes(x = estimate, y = y)) + | |||
geom_point(shape = 1, alpha = 0.5) + | |||
scale_x_log10() + | |||
geom_vline(xintercept = 1, linetype = 2, color = "red") + | |||
theme_bw() + | |||
xlab("odds ratio (log10)") + | |||
ylab("signficance (-log10[p])") | |||
</syntaxhighlight> | |||
==multiple regression models (with broom::tidy)== | |||
<syntaxhighlight lang='bash'> | |||
library(broom) | |||
compensation.models <- compensation %>% group_by(Grazing) %>% do(tidy(lm(Fruit ~ Root, data = .))) %>% filter(term != '(Intercept)') | |||
</syntaxhighlight> | |||
==multiple regression models (with nested tibble)== | |||
<syntaxhighlight lang='bash'> | |||
# build multiple models, one for each serum: | |||
by_serum <- two_od %>% group_by(Serum) %>% nest() # nested data frame, one row per serum | |||
x_model <- function(df) { # model function (similar to lapply) | |||
x <- df %>% filter(OspC!='A02' & OspC!='A04') | |||
# x <- df %>% filter(OspC!='A02' & OspC!='A04' & OspC!='N14') | |||
lm(OD1 ~ OD2, data = x) | |||
} | |||
by_serum <- by_serum %>% mutate(model = map(data, x_model)) # add model to each serum | |||
output <- vector("list") | |||
for(i in 1:length(by_serum$Serum)) { | |||
df <- by_serum$data[[i]] | |||
model <- by_serum$model[[i]] | |||
pd <- predict.lm(model, newdata = data.frame(OD2 = df$OD2)) | |||
output[[i]] <- data.frame(OspC = df$OspC, | |||
OD2 = df$OD2, | |||
OD1 = df$OD1, | |||
OD1_pred = pd, | |||
Serum = rep(by_serum$Serum[i], nrow(df)) | |||
) | |||
} | |||
df.out <- bind_rows(output) | |||
</syntaxhighlight> | |||
==multiple regression models (with custum functions)== | |||
<syntaxhighlight lang='bash'> | |||
# build model | |||
models <- xy %>% split(.$Serum) %>% map(~lm( aff_PL ~ aff_C3H, data= .)) | |||
# get r-squared: | |||
models %>% map(summary) %>% map_dbl("adj.r.squared") | |||
# get p values (of slope, using a custom function): | |||
get.pvalue <- function(x){ | |||
s <- summary(x); | |||
s$coefficients[2, 4] | |||
} | |||
models %>% map(get.pvalue) %>% unlist() | |||
# get slope: | |||
get.slope <- function(x){ | |||
s <- summary(x); | |||
s$coefficients[2, 1] | |||
} | |||
models %>% map(get.slope) %>% unlist() | |||
# get intercept: | |||
get.intercept <- function(x){ | |||
s <- summary(x); | |||
s$coefficients[1, 1] | |||
} | |||
models %>% map(get.intercept) %>% unlist() | |||
</syntaxhighlight> | |||
=Microbial genome alignment with MUMMER4 and other tools= | |||
==SARS-CoV-2 GISAID pipleline== | |||
<pre> | |||
####################################### | |||
# Parse GISAID sequences fasta | |||
###################################### | |||
1. bioseq -B cov.fas # burst into individual files | |||
1a. move the un-bursted file out the "human-files" directory!!! | |||
mv cov-human.fas ../ | |||
2. sam align (weigang@wallace:~/cov-03-09-2030/host-human$ for f in *.fas; do ../sam-align.bash $f; done ) | |||
nucmer --sam-long=COH1 B111.fa COH1.fa | |||
samtools view -b COH1.sam -T B111.fa > COH1.bam | |||
samtools sort COH1.bam -o COH1.sorted.bam | |||
samtools index COH1.sorted.bam | |||
with script: for f in *.fas; do ../sam-align.bash $f; done | |||
3c. Less strict call: bcftools mpileup -Ou -f ../ref.fas *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf -P 0.05 (or -P 0.1; large P value for less strict call, default 1.1e-3) | |||
# 3b. bcftools mpileup -Ou -f ../ref.fas *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf, with default ploidy as 1: | |||
weigang@wallace:~/cov-03-09-2020/cov57$ cat ploidy.txt | |||
* * * M 1 | |||
6a. bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf ( get only biallelic SNPs) | |||
6b. vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all --out cov.vcf # filter SNPs only | |||
7. filter sites by allele counts: only informative sites | |||
bcftools view snps.bcf > snps.vcf | |||
vcftools --vcf snps.vcf --mac 2 --recode --recode-INFO-all --out snps2.vcf | |||
</pre> | |||
==Pipeline== | |||
<pre> | |||
###################### | |||
Host & Enviroment: wqiu@bioit.hunter.cuny.edu | |||
vcftools & clonalframe "source activate gbs" | |||
bcftools: "source activate ngs" | |||
1. align fastq to ref | |||
1) index reference genome | |||
bwa index B111.fa | |||
=> 5 files: amb, ann, bwt, pac, sa | |||
2) align reads to ref | |||
ls -1 *.gz | cut -d'.' -f1 | sort -u > sample.list | |||
byobu | |||
cat sample.list | while read line; do bwa mem B111.fa ${line}.R1.fastq.gz ${line}.R2.fastq.gz > $line.sam; done | |||
=> sam file | |||
2. convert to bam, sort by ref coordinates | |||
cat sample.list | while read line; do samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam; done | |||
=> acc.bam | |||
1+2: wrapped to save space | |||
cat sample.list | while read line; do | |||
begin=$(date +"%D:%H:%M") | |||
echo -ne "$line ... started $begin ..."; | |||
bwa mem B111.fa ${line}.clean.1.fastq.gz ${line}.clean.2.fastq.gz > $line.sam 2> /dev/null; | |||
bwa_time=$(date +"%D:%H:%M") | |||
echo -ne "sam file generated: $line.sam on $bwa_time ..."; | |||
samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam 2> /dev/null; | |||
bam_time=$(date +"%D:%H:%M") | |||
echo -ne "bam file generated: $line.bam on $bam_time ..."; | |||
rm $line.sam; | |||
echo "sam file deleted. Done"; | |||
done | |||
3. index | |||
cat sample.list | while read line; do samtools index ${line}.bam; done | |||
=> acc.bam.bai | |||
4. Variant Call | |||
1) make ploidy.txt file | |||
cat > ploidy.txt | |||
* * * M 1 | |||
2) pileup and make bcf file | |||
bcftools mpileup -Ou -f B111.fa *.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf -P 0.05 | |||
((or -P 0.1; large P value for less strict call, default 1.1e-3)) | |||
=> calls.bcf | |||
5. analyse calls.bcf; remove outlier SNPs and samples | |||
1) check stats | |||
bcftools stats calls.bcf > call.stats | |||
ts/tv (transition/transversion) better greater than 3 | |||
check AF freq, remove the first class if necessary (e.g., vcf --maf 0.008) | |||
check counts: | |||
vcftools --vcf snps.vcf --counts --out snps | |||
Filter by maf counts (at least 2): | |||
vcftools --vcf snps-255.recode.vcf --recode --recode-INFO-all --out snps2 --mac 2 | |||
2) get only biallelic SNPs | |||
bcftools view -m2 -M2 --types snps calls-87.bcf > snps-87.vcf | |||
bcftools stats snps.vcf | ll | |||
bcftools view -m2 -M2 --types snps call-49.vcf > snps-49.vcf | |||
3) merge samples | |||
bgzip snps-87.vcf | |||
bgzip snps-49.vcf | |||
=> vcf.gz file | |||
bcftools index snps-87.vcf.gz | |||
bcftools index snps-49.vcf.gz | |||
=> vcf.gz.csi file | |||
bcftools merge -Ob snps-87.vcf.gz snps-49.vcf.gz > snps-136.bcf | |||
get only biallelic SNPs | |||
bcftools view -m2 -M2 --types snps snps-136.bcf > snps-136.vcf | |||
ln -s snps-136.vcf snps.vcf | |||
# use vcftools to remove invariant sites, in case samples are dropped: | |||
vcftools --gzvcf snps2.vcf.gz --non-ref-ac-any 1 --recode --recode-INFO-all --out tmp --remove-indv IND | |||
6. vcf to fasta | |||
1) modify sample name (should use "bcftools reheader -s change-id.txt -o tmp.vcf snps.vcf") | |||
cat snps.vcf | sed 's/.bam//g; s/.sorted//g; s/batch-..//g' > tmp.vcf | |||
mv tmp.vcf snps-136.vcf | |||
2) sample list | |||
bcftools query -l snps.vcf > sample.txt | |||
3) make fasta: could be gzipped | |||
cat sample.txt | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' snps.vcf; echo; done > sample.fas | |||
Note: . in fas file means missing nt, handled correctly iqtree | |||
4) get ref seq | |||
echo ">B31" > ref.fas | |||
ll snps.vcf => get ref id | |||
grep "^Chromosome" snps.vcf | cut -f4 | paste -s -d '' - >> ref.fas | |||
#grep "^CP021772" snps.vcf | cut -f4 | paste -s -d '\0' - >> ref.fas (#if on apple system) | |||
cat ref.fas >> sample.fas | |||
=> 137 samples, 46783 snps | |||
go to genometracker: | |||
scp sample.fas snps-136.vcf weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/. | |||
on genometracker: | |||
ln -s snps-136.vcf snps.vcf | |||
# subset samples (e.g., bbss only): | |||
bcftools view -Ov -S samples-bbss.txt --force-samples snp5-lp17.vcf > snp5-lp17-bbss.vcf | |||
7. make tree (with iqtree, for SNP-only alignment, with bootstrap; can't have constant sites) | |||
iqtree2 -s sample.fas -m GTR+ASC -B 1000 -nt AUTO (-n AUTO to use all CPUs on the cluster) | |||
# if containing full seq alignment with constant sites, e.g., HIV env gene alignment | |||
# iqtree -s $work_dir/test.fas -m GTR+F+G -B 1000 --redo -nt AUTO | |||
=> sample.fas.contree | |||
8. consequence call | |||
1) download ref in genbank format | |||
NCBI nucleotide database: search CP021772 | |||
=> B111.gb | |||
2) convert genbank to gff3 format | |||
(1) py file: modified from https://dmnfarrell.github.io/bioinformatics/bcftools-csq-gff-format | |||
=> gb2gff3.py | |||
(2) ./gb2gff3.py B111.gb | |||
=> B111.gff3 | |||
(3) change acc to match ref name in snps.vcf | |||
cat B111.gff3 | sed 's/^CP021772.1/CP021772/' > tmp.gff3 | |||
mv tmp.gff3 B111.gff3 | |||
3) call consequence (to tsv or bcf files) | |||
bcftools csq -f B111.fa -g B111.gff3 snps.vcf -Ot -o snps-csq.tsv | |||
cut -f2,5,6 snps-csq.tsv | tr '|' '\t' | cut -f1-5,7- > snps-csq2.tsv | |||
9. web development | |||
1) orf info from genbank | |||
perl get-orf-by-acc.pl CP021772 > orf.csv | |||
2) | |||
scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/sample.fas.contree . | |||
#biotree -m sample.fas.contree | biotree --ref 'B111' > sample.dnd | |||
#biotree -r 'GBS02' sample.fas.contree | biotree --ref 'B111' > sample.dnd | |||
#biotree -E 0.005 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > sample.dnd | |||
biotree -D 90 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > tree.dnd | |||
tree visualization by R or PhyloView or R/ggtree | |||
3) | |||
scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/snps-csq2.tsv . | |||
4) pheno.csv | |||
</pre> | |||
== clonal frame pipeline (WGS)== | |||
# Reconstitute whole-genome alignment from vcf; following this post: https://gatk.broadinstitute.org/hc/en-us/community/posts/360070002671-converting-multisample-vcf-to-fasta. Algorithm: split vcf into single-sample vcf's; use "bcftools consensus" to obtain alternate ref seqs. Environment: conda activate ngs (on bioit) | |||
## create one-sample vcf's | |||
cat ../samples-bbss.txt | while read line; do bcftools view -Ov -s $line ../bbss-cp26.vcf.gz > snp5-cp26-$line.vcf; done | |||
## bgzip & tabit | |||
for f in *.vcf; do bgzip $f; tabix $f.gz; done (or bcftools index) | |||
for f in snp5-lp54-*.vcf; do bgzip $f; bcftools index $f.gz; done | |||
## Get consensus | |||
From doc: "Create consensus sequence by applying VCF variants to a reference fasta file. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence. Using the --sample (and, optionally, --haplotype) option will apply genotype (haplotype) calls from FORMAT/GT." | |||
tail -27 ../samples-bbss.txt | while read line; do cat ../../B31-files/cp26_plasmid.fasta | bcftools consensus --sample $line snp5-cp26-$line.vcf.gz | sed "s/>cp26/>$line/" > bbss-cp26-$line.fa; done | |||
## Add reference: | |||
cat ../../B31-files/cp26_plasmid.fasta | sed "s/>cp26/>B31/" > bbss-cp26-B31.fa | |||
concatenate & check length: | |||
cat *.fa > wgs-cp26.fasta | |||
bioseq -l wgs-cp26.fasta | |||
check with bioaln for variability | |||
conda activate gbs | |||
bioaln -i 'fasta' -m wgs-cp26.fasta | less | |||
# run ClonalFrame (conda env "gbs") | |||
srun ClonalFrameML ss-cp26.dnd wgs-cp26.fasta wgs_cp26 -kappa 4.00 -emsim 100 | |||
== Align two assemblies== | |||
# With minimap2: https://github.com/lh3/minimap2 | |||
<code>minimap2 -ax asm5 ref.fa asm.fa > aln.sam # assembly to assembly/ref alignment</code><br> | |||
"For cross-species full-genome alignment, the scoring system needs to be tuned according to the sequence divergence."<br> | |||
"-au options: asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5% sequence divergence" | |||
# With MUMMER4 ([https://mummer4.github.io MUMMER4]): align two genomes, output delta file: | |||
## nucmer -p A909-COH1 COH1.fa A909.fa | |||
## dnadiff -p A909-COH1 -d A909-COH1.delta | |||
## less A909-ref.report, OR <code> grep AvgIdentity A909-ref.report </code> | |||
## output sam file | |||
## nucmer --sam-long=COH1 B111.fa COH1.fa | |||
## samtools view -b COH1.sam -T B111.fa > COH1.bam | |||
## samtools sort COH1.bam -o COH1.sorted.bam | |||
## samtools index COH1.sorted.bam | |||
# Use GSAlign | |||
<pre> | |||
conda config --add channels defaults | |||
conda config --add channels bioconda | |||
conda config --add channels conda-forge | |||
conda install gsalign | |||
gsalign index ref.fas cov | |||
gsalign -i cov -q test.fas | |||
</pre> | |||
# bcf annotate with gff file: | |||
<code>bcftools csq -f ../ensembl-B31-downloads/lp54.fa -g ../ensembl-B31-downloads/lp54.gff3 -Ov -o lp54.anno.vcf lp54.vcf<code> | |||
# variant call with bcftools: https://samtools.github.io/bcftools/howtos/variant-calling.html | |||
# bcftools cheatsheet: https://gist.github.com/elowy01/93922762e131d7abd3c7e8e166a74a0b | |||
=BERT classifier= | |||
* Project start: Winter 2020 | |||
* Project team: Saadmanul Islam <Saadmanul.Islam91@myhunter.cuny.edu> | |||
* Tutorial page: https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04 | |||
* Data set: bioluminascence | |||
* Goal: cross validation | |||
=Estimate LD50 using GLM= | |||
<syntaxhighlight lang='bash'> | |||
## curves | |||
library(dplyr) | |||
library(ggplot2) | |||
library(growthcurver) | |||
library(reshape2) | |||
library(purrr) | |||
setwd("/Users/desireepante/Desktop/Programming") | |||
OD20<-read.csv("OD_0220.csv") | |||
OD220<-OD20[-c(6,11:12,20,26:28, 32:33,45:47),] | |||
od <- filter(OD220, IPTG == 0); | |||
#N15 | |||
od15d <- mutate(od15d, od.norm = OD/max(OD)) | |||
test.1<-glm(min ~ OD, data= od15d, family= "binomial") | |||
ilink<-family(test.1)$linkinv | |||
test.1.pd <- with(od15d, | |||
data.frame(min = seq(min(min), max(min), | |||
length = 10))) | |||
test.1.pd <- cbind(test.1.pd, predict(test.1, test.1.pd, type = "link", se.fit = TRUE)[1:2]) | |||
test.1.pd <- transform(test.1.pd, Fitted = ilink(fit), Upper = ilink(fit + ( se.fit)), | |||
Lower = ilink(fit - ( se.fit))) | |||
test.test<-ggplot(od15d, aes(x = min, y = od.norm)) + | |||
geom_ribbon(data = test.1.pd, aes(ymin = Lower, ymax = Upper, x = min), | |||
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) + | |||
geom_line(data = test.1.pd, aes(y = Fitted, x = min)) + | |||
geom_point() | |||
test.test | |||
</syntaxhighlight> | |||
=1k genome project/VCFTools= | |||
<syntaxhighlight lang="bash"> | |||
# Extract APOL1 locus | |||
vcftools --gzvcf ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --from-bp 36253071 --to-bp 36267530 --recode --stdout --recode-INFO-all --chr 22 | gzip -c > apol1.vcf.gz | |||
# allele frequencies/counts | |||
vcftools --vcf pvt1.recode.vcf --keep AFR.supergroup --counts --out afr | |||
# pi | |||
# Fst | |||
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop AMR.supergroup --vcf pvt1.recode.vcf --out AFR-AMR --remove-indels | |||
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AFR-EAS --remove-indels | |||
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AFR-EUR --remove-indels | |||
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AFR-SAS --remove-indels | |||
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AMR-EAS --remove-indels | |||
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AMR-EUR --remove-indels | |||
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AMR-SAS --remove-indels | |||
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out EAS-EUR --remove-indels | |||
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EAS-SAS --remove-indels | |||
vcftools --weir-fst-pop EUR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EUR-SAS --remove-indels | |||
# concatenate vcf from different chromosomes | |||
vcf-concat apol1-exon-6-snps.vcf tvp23-exon-1-snps.vcf hpr-exon-5-snps.vcf > 3-exons.vcf | |||
# vcf to tabular format | |||
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp | |||
vcf-query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp | |||
</syntaxhighlight> | |||
=Ggplot2 tips= | |||
* Data binning and nice histograms with density and boxplot: | |||
[https://www.jdatalab.com/data_science_and_data_mining/2017/01/30/data-binning-plot.html R binning] | |||
<syntaxhighlight lang="bash"> | |||
# numerical vector as factor so that it is sorted numerically (no need for as.character()) | |||
x.df$train.size <- as.factor(x.df$train.size) | |||
# jitter by group with "position_jitterdodge()" | |||
x.df %>% | |||
ggplot(aes(x=rate.cat, y=accuracy, color=group)) + | |||
geom_boxplot(position=position_dodge(0.8)) + | |||
geom_jitter(position=position_jitterdodge(0.8)) + | |||
facet_wrap(~train.size, nrow = 1) + ylim(0,1) + | |||
geom_abline(intercept = 0.5, slope = 0, linetype="dashed") + | |||
theme(legend.position = "bottom") | |||
# line plot with multiple factors with "interaction()" | |||
rec.df %>% | |||
ggplot(aes(x = tissue, y = rec.rate)) + | |||
geom_boxplot(outlier.shape = NA, aes(fill = tissue), alpha = 0.5) + | |||
geom_point(size = 3, alpha = 0.5, aes(color = symptom)) + | |||
geom_line(aes(group = interaction(id, timepoint)), linetype = 2) + | |||
theme_bw() + | |||
scale_fill_manual(values = c("lightgreen", "lightpink")) + | |||
ylab("Num Rec frags") + | |||
xlab("tissue") + | |||
scale_y_log10() | |||
</syntaxhighlight> | |||
=Capture results in R/GA example= | |||
<syntaxhighlight lang="bash"> | |||
# in general use | |||
lapply(seq_along(x), function(i) {name=names(x}[i]) | |||
# to capture names | |||
# save to a data.frame | |||
# bind into a single data.frame using dplyr function: | |||
x.df <-bind_rows(x.rate) | |||
library(stringdist) | |||
library(ggplot2) | |||
out.fit <- lapply(seq(from = 5,to = 150, by = 5), function (num.epi) { | |||
num.alle <- 20; # num of antigens | |||
epi.fit <- function(x) { # fitness defined by the minimum pair diff | |||
x.list <- split(x, rep(1:num.alle, each=num.epi)) | |||
# sum(seq_distmatrix(x.list, method = "hamming"))/(num.alle * (num.alle-1) / 2)/num.epi # average pairwise diff | |||
min(seq_distmatrix(x.list, method = "hamming"))/num.epi # min diff | |||
} | |||
epi.ga <- ga(type = "binary", fitness = epi.fit, nBits = num.epi * num.alle) | |||
c(all = num.alle, epi = num.epi, min.pair.dist = epi.ga@fitnessValue) | |||
}) | |||
epi.df <- t(as.data.frame(out.fit)) | |||
rownames(epi.df) <- 1:nrow(epi.df) | |||
ggplot(as.data.frame(epi.df), aes(x=epi, y=min.pair.dist)) + geom_point() + geom_line() | |||
</syntaxhighlight> | |||
=NCI Cloud (Seven Bridges)= | |||
* Documentation: http://www.cancergenomicscloud.org/controlled-access-data/ | |||
* [https://docs.cancergenomicscloud.org/blog/programmatically-access-tcga-data-using-the-seven-bridges-cancer-genomics-cloud API Documentation (to access meta-data)] | |||
* Steps (June 20, 2018) | |||
** Create project | |||
** Data -> Data Overview -> Select cancer type -> Data Browser | |||
** Copy files to project | |||
** Go to project, "add filter", filter by "experimental strategy" (RNA-SEQ, miRNA-SEQ) | |||
** To download files: select & Download link; run <code>aria2c -i download-links.txt</code> | |||
** To download meta data: Download manifest | |||
* Steps (better) | |||
# Create project | |||
# Search "Public Apps" | |||
# Load data & Run | |||
* Steps (a little hard to follow) | |||
# Create project | |||
# Data -> Data Overview -> Select cancer type -> Data Browser | |||
# Click File to add property (e.g., "miRNA") -> "Copy Files to Project" -> choose project to add & add a tag to the files | |||
# Go to Project -> Files -> filter files by tag name -> select -> Save | |||
# Choose a public protocol/workflow (e.g., RNA-SEQ alignment) | |||
# Add index and GTF files | |||
# Go to Project -> task -> run | |||
=Reloop a circular plasmid from PacBio assembly= | |||
# Input file: Z9-cp26.fa (containing overlapping ends); b31-cp26.fa (reference) | |||
# Find overlap ends by blasting against itself: blastn -task megablast -query Z9-cp26.fa -subject Z9-cp26.fa -evalue 1e-100 -outfmt 6 | |||
# The result says “1 -7617” is the same as “26489-34118”, so we use bioseq to remove overlapping part: bioseq –s”1,26488” Z9-cp26.fa > Z9-cp26.fa2 | |||
# Run numcer against b31-cp26 to identify the starting position: nucmer --coord ../b31-cp26.fa Z9-cp26.fa2 | |||
# The result says the first base of B31 corresponds to 22171 at Z9-cp26, so we run bioseq to reloop: bioseq –R 22171 Z9-cp26.fa2 > Z9-cp26.fa3 | |||
# Run numcer to confirm: nucmer --coord ../b31-cp26.fa Z9-cp26.fa3 | |||
=KEGG REST API= | |||
<syntaxhighlight lang=python"> | |||
# Author: Edgaras Bezrucenkovas | |||
# Date: Nov 12, 2017 | |||
#KEGG API | |||
#Obtain module pathways and orthologs from KEGG pa14 database | |||
#Import module to fetch the data from KEGG database | |||
import urllib.request | |||
import re #Regular expressions | |||
#Get PA14 modules | |||
kegg_db = "http://rest.kegg.jp/link/module/pau" | |||
pa14_mdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines() | |||
pa14_mods = [] | |||
for line in pa14_mdata: | |||
line = line.split("\t") | |||
mod = line[1].strip("md:pau_") | |||
pa14_mods.append(mod) | |||
#Get PA14 genes | |||
kegg_db = "http://rest.kegg.jp/link/pau/module" | |||
pa14_gdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines() | |||
pa14_genes = [] | |||
for line in pa14_gdata: | |||
gdata = line.split("\t") | |||
gdata[0] = gdata[0].strip("md:pau_") | |||
gdata[1] = gdata[1].strip("pau:") | |||
pa14_genes.append(gdata) | |||
#Open the file containing KEGG compounds ids | |||
id_file = open("kegg-entries.txt") | |||
kegg_ids = id_file.readlines() | |||
compound_ids = [] | |||
for cid in kegg_ids: | |||
cid = cid.strip("\n") | |||
compound_ids.append(cid) | |||
id_file.close() | |||
#Get module pathway ids | |||
for cid in compound_ids: | |||
kegg_db = "http://rest.kegg.jp/link/module/compound:"+cid | |||
module_data = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines() | |||
module_ids = [] | |||
for line in module_data: | |||
if re.match("cpd", line): | |||
line = line.split("\t") | |||
m_id = line[1].strip("md:") | |||
module_ids.append(m_id) | |||
else: | |||
module_ids.append("NA") | |||
#Filter module pathway ids | |||
modules_final = [] | |||
for m_id in module_ids: | |||
if m_id == "NA": | |||
modules_final.append("NA") | |||
elif m_id in pa14_mods: | |||
modules_final.append(m_id) | |||
if not modules_final: | |||
modules_final.append("NA") | |||
#Get genes | |||
genes = {} | |||
for m_id in modules_final: | |||
key = m_id | |||
genes.setdefault(key, []) | |||
if m_id == "NA": | |||
genes[key].append("NA") | |||
else: | |||
for i in pa14_genes: | |||
if m_id == i[0]: | |||
genes[key].append(i[1]) | |||
#Get module names using module ids | |||
module_names = {} | |||
for m_id in modules_final: | |||
if m_id == "NA": | |||
m_name = {"NA":"NA"} | |||
module_names.update(m_name) | |||
else: | |||
kegg_db = "http://rest.kegg.jp/list/module:"+m_id | |||
module_data = urllib.request.urlopen(kegg_db).read().decode("utf-8").split("\t") | |||
module_name = module_data[1].strip("\n") | |||
m = {m_id:module_name} | |||
module_names.update(m) | |||
#Output | |||
for k, v in genes.items(): | |||
for i in v: | |||
print(cid,k,i,module_names.get(k), sep = "\t") | |||
</syntaxhighlight> | |||
=D3js Tutorial= | |||
# Install brackets editor from adobe: http://brackets.io/ | |||
# Enable web server & create a directory "public_html" in your home directory | |||
# Create the following file structure: test/index.html; test/js/index.js; test/css/index.css | |||
# Basic D3: following this link | |||
# Metabolomics: following this link | |||
=Transform a wide data frame to a long one= | |||
* A function in R | |||
<syntaxhighlight lang="bash"> | |||
df2long = function(x, varname, cname, rname){ | |||
num.col = dim(x)[2] | |||
x$tmp = rownames(x) | |||
y = reshape(x, varying=1:num.col, v.names=varname, timevar=cname, times=names(x)[1:num.col], direction="long") | |||
y=y[,-4] | |||
rownames(y)=NULL | |||
colnames(y)[1] = rname | |||
y | |||
} | |||
</syntaxhighlight> | |||
* A function in Perl | |||
<syntaxhighlight lang="perl"> | |||
#!/usr/bin/env perl | |||
# expect a data frame with row and col headings | |||
use strict; | |||
use warnings; | |||
use Data::Dumper; | |||
my $first = 0; | |||
my @colnames; | |||
my %data; | |||
while(<>) { | |||
chomp; | |||
$first++; | |||
if ($first == 1) { | |||
@colnames = split; | |||
next; | |||
} | |||
my @data = split; | |||
my $rowname = shift @data; | |||
for (my $i=0; $i<=$#data; $i++) { | |||
$data{$rowname}->{$colnames[$i]} = $data[$i]; | |||
} | |||
} | |||
#print Dumper(\%data); | |||
foreach my $row (keys %data) { | |||
foreach my $col (@colnames) { | |||
print $row, "\t", $col, "\t", $data{$row}->{$col}, "\n"; | |||
} | |||
} | |||
exit; | |||
</syntaxhighlight> | |||
=Variant call with samtools/bcftools & variant verification using IGV= | |||
# mpileup: <code>samtools mpileup -uf ref.fa sorted1.bam sorted2.bam ... </code> | |||
# [https://samtools.github.io/bcftools/howtos/variant-calling.html call variants]: <code>bcftools call -mv > var.raw.vcf </code> OR: <code>bcftools call -c -v --ploidy 1 gbs-50.mpileup > raw.vcf </code> | |||
# extract only SNP sites: <code>vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all </code> | |||
# filter by quality: <code>bcftools filter -i '%QUAL>200' gbs50-snps.recode.vcf > gbs50-snps2.vcf </code> | |||
# check by TsTv: <code>bcftools stats gbs50-snps2.vcf | grep TSTV </code> | |||
# Extract SNPs into Fasta: | |||
## Or, with bcftools: <code>bcftools query -l gbs50-snps2.vcf > samples</code> | |||
## <code>cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' gbs50-snps2.vcf; echo; done > samples.fas </code> | |||
## get ref seq: <code>echo ">ref" >> sample.fas; grep "^CP" gbs50-snps2.vcf | cut -f4 | paste -s -d '' >> sample.fas</code> | |||
# Visualization with IGB | |||
## Prepare & load reference genomes | |||
## Load FASTA: <code>bioseq -i'genbank' ref.gb > ref.fas</code> | |||
## Prepare GFF3 file: <code>bp_genbank2gff3.pl --CDS --filter exon --filter gene /home/chongdi/michelle/pa_rp73.gb</code> | |||
# Prepare BAM files (see below on how to index and align reads using bwa and samtools, in "ospC Amplicon") | |||
# Index sorted bam files with samtools & load BAM files (multiple bam files okay) | |||
=KRAKEN= | |||
# assigns taxonomic labels to short DNA reads by examining the k-mers within a read and querying a database with those k-mers: <code>kraken -db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 --fastq-input 02015P1_S18_L001_R1_001.fastq.gz 02015P1_S18_L001_R2_001.fastq.gz --gzip-compressed --output outfile --paired </code> | |||
# summarize taxonomy: <code> kraken-translate --db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 outfile > predict.file </code> | |||
# count unique strains: <code>pat-8]$ cut -f2 s75.predict | cut -f10 -d';' | grep "^[a-zA-Z]" | sort | uniq -c | sort -rn</code>. | |||
# Pick the strain with the highest counts as the reference genome | |||
=PATRIC microbial genome annotation CLI= | |||
* Install Debian distribution from github: https://github.com/PATRIC3/PATRIC-distribution/releases/tag/1.015 | |||
* Tutorial: http://tutorial.theseed.org/PATRIC-Tutorials/ | |||
* Fetch all genomes from a genus: <code>p3-all-genomes --in genome_name,Borrelia > Borrelia.genomes</code>; <code>p3-get-genome-data -i Borrelia.genomes -e genome_status,complete > Borrelia-complete.genomes</code> | |||
* Retrieves features from coding sequences only: input file must be a list of genome ids with a header! <code>p3-get-genome-features -i $file -e feature.feature_type,CDS -a pgfam_id -a patric_id -a plfam_id -a start -a end -a strand -a genome_name -a product -a accession > ${name}-all-features.txt </code> | |||
* Retrieve feature sequence: <code>p3-echo -t feature.accession "fig|100177.4.peg.1" | p3-get-feature-sequence --dna</code> | |||
* Submit genomes for annotation | |||
# Log in: p3-login user name/password | |||
# Submit a genome for annotation: | |||
## <code>submit-patric-annotation --scientific-name "Pseudomonas aeruginosa" --taxonomy-id 287 --genetic-code 11 --domain Bacteria --log pa.log /weigang@patricbrc.org/home/wsdir gid_5.fas</code> | |||
## <code>submit-patric-annotation --scientific-name "Streptococcus agalactiae" --taxonomy-id 1311 --genetic-code 11 --domain Bacteria --log sa-1.log /weigang@patricbrc.org/home/wsdir sa-1.fas</code> | |||
## Message: "Uploading gid_5.fas to workspace at /weigang@patricbrc.org/home/wsdir/gid_5/gid_5.fas” | |||
# Create genome groups | |||
## login | |||
## <code>cat trep.complete.ids | p3-put-genome-group "Treponema complete”</code> | |||
## <code>p3-get-genome-group "Treponema complete”</code> | |||
=Parsimony reconstruction using MPR= | |||
<syntaxhighlight lang=R"> | |||
# MPR for each column | |||
plot.mpr <- function(column=1) { | |||
plot.phylo(tr.unrooted, main = paste(colnames(ph.t)[column])) | |||
tmpr<-MPR(ph.t[,column], tr.unrooted, outgroup = "gid_1311.1320") | |||
nodelabels(paste("[", tmpr[, 1], ",", tmpr[, 2], "]", sep = "")) | |||
tiplabels(ph.t[,column][tr.unrooted$tip.label], adj = -2) | |||
} | |||
# add internal node states | |||
datalist <- data.frame(fam=character(), a=numeric(), b=numeric(), c=numeric()) | |||
for (i in 1:ncol(ph.t)) { | |||
#for (i in 1:10) { | |||
tmpr<-MPR(ph.t[,i], tr.unrooted, outgroup = "gid_1311.1320"); | |||
out<-c(colnames(ph.t)[i],tmpr[,1]) | |||
datalist <- rbind(datalist, data.frame(fam=colnames(ph.t)[i], a=tmpr[1,1], b=tmpr[2,1], c=tmpr[3,1])) | |||
} | |||
#plotting gains and losses | |||
tr.unrooted <- unroot(tr) | |||
gains<-apply(ph.node.states[,9:15], 2, function(x) length(which(x>0))) | |||
losses<-apply(ph.node.states[,9:15], 2, function(x) length(which(x<0))) | |||
plot(tr.unrooted) | |||
edgelabels(gains, adj = c(0.5,-0.25), col=2, frame= "none") | |||
edgelabels(losses, adj = c(0.5,1.25), col=4, frame= "none") | |||
</syntaxhighlight> | |||
=ospC amplicon identification= | |||
<syntaxhighlight lang="bash"> | |||
#Index nucleotide file: | |||
bwa index ref.fa | |||
#align: | |||
bwa mem ref.fa sample_read1.fq sample_read2.fq > sample.sam | |||
#BAM output: | |||
samtools view -b sample.sam > sample.bam | |||
#Sort & index BAM file: | |||
samtools sort -o sample.sorted sample.bam | |||
samtools index sample.sorted | |||
# Calculate coverage by bedtools (better, raw read coverages) | |||
Based on the update: https://bioinformatics.stackexchange.com/questions/2318/how-to-count-reads-in-bam-per-bed-interval-with-bedtools | |||
bedtools bamtobed -i sample.sorted > sample.bed # bam to bed | |||
bedtools coverage -b sample.bed -a ospC.bed > sample.coverage | |||
Alternatives (not good): | |||
bedtools coverage -b sample.sorted.bam -a refs.bed -d > sample.bedtools.cov # per-site | |||
bedtools coverage -b sample.sorted.bam -a refs.bed -counts > sample.bedtools.cov # average | |||
# Generate VCF file (not needed for coverage) | |||
samtools mpileup -uf ref.fa s18.sorted.bam s56.sorted.bam > test.mpileup | |||
bcftools call -c -v test.mpileup > test.raw.vcf | |||
#R plot | |||
</syntaxhighlight> | |||
<syntaxhighlight lang="bash"> | |||
# BASH script from Tony Fung & Jamie Cabigas (Spring 2019) | |||
#!/usr/bin/env bash | |||
# ---------------------------------------- | |||
# File : fas-to-vcf.sh | |||
# Author : Tony Fung & Jamie Cabigas | |||
# Date : March 21, 2019 | |||
# Description : Batch convert FAS files to VCF files | |||
# Requirements : samtools 1.9, bcftools 1.9, bwa 0.7.12 | |||
# Input : reference file, format of sequence files, up to 7 sequence id's | |||
# Sample Command : bash fas-to-vcf.sh ref.fasta fastq ERR221622 ERR221661 ERR221662 ERR221663 | |||
# Output : up to 7 VCF files | |||
# ---------------------------------------- | |||
if [ $# -lt 3 ] | |||
then | |||
echo "Usage: bash $0 <ref_file> <format_of_sequence_files> <sequence_id> [<sequence_id_2> <sequence_id_3> etc.]"; | |||
exit; | |||
fi | |||
for arg | |||
do | |||
if [ $arg == $1 ] | |||
then | |||
ref_file="../$1"; | |||
elif [ $arg == $2 ] | |||
then | |||
file_format=$2; | |||
else | |||
seq_id=$arg; | |||
fas_file_1="${seq_id}_1.$file_format"; | |||
fas_file_2="${seq_id}_2.$file_format"; | |||
pwd; | |||
cd $seq_id; | |||
pwd; | |||
# Index nucleotide file: | |||
bwa index $ref_file; | |||
# Align: | |||
bwa mem $ref_file $fas_file_1 $fas_file_2 > $seq_id.sam | |||
# BAM output: | |||
samtools view -b $seq_id.sam > $seq_id.bam | |||
# Sort & index BAM file: | |||
samtools sort $seq_id.bam -o $seq_id.sorted.bam | |||
samtools index $seq_id.sorted.bam | |||
# Generate VCF file: | |||
# -u is deprecated, use bcftools mpileup instead | |||
# samtools mpileup -uf $ref_file $seq_id.sorted.bam > $seq_id.mpileup | |||
bcftools mpileup -f $ref_file $seq_id.sorted.bam > $seq_id.mpileup | |||
bcftools call -c -v $seq_id.mpileup > $seq_id.raw.vcf | |||
cd ..; | |||
pwd; | |||
fi | |||
done | |||
</syntaxhighlight> | |||
=Running a Screen Session= | |||
==use byobu== | |||
# start a screen session by typing "byobu" | |||
# run commands | |||
# Detach by pressing "F6" | |||
# Reattach by typing "byobu" | |||
# Terminate by typing "exit" | |||
==use screen== | |||
* Start a screen session | |||
<div class="toccolours mw-collapsible"> | |||
<syntaxhighlight lang=bash"> | |||
screen | |||
</syntaxhighlight> | |||
</div> | |||
* Detach the running mission | |||
<div class="toccolours mw-collapsible"> | |||
<syntaxhighlight lang=bash"> | |||
ctrl + A + D | |||
</syntaxhighlight> | |||
</div> | |||
* Show the list of running process | |||
<div class="toccolours mw-collapsible"> | |||
<syntaxhighlight lang=bash"> | |||
screen -ls | |||
</syntaxhighlight> | |||
</div> | |||
* Reattach a running process | |||
<div class="toccolours mw-collapsible"> | |||
<syntaxhighlight lang=bash"> | |||
screen -r ProcessID | |||
</syntaxhighlight> | |||
</div> | |||
* Terminate a process | |||
<div class="toccolours mw-collapsible"> | |||
<syntaxhighlight lang=bash"> | |||
ctrl + A | |||
:quit | |||
</syntaxhighlight> | |||
</div> | |||
=Bp-utils: sequence, alignment & tree utilities by Qiu Lab= | =Bp-utils: sequence, alignment & tree utilities by Qiu Lab= | ||
==bioseq: sequence/FASTA manipulations== | ==bioseq: sequence/FASTA manipulations== | ||
Line 137: | Line 1,084: | ||
==BLAST+: search("google") for homologs/pariwise alignment== | ==BLAST+: search("google") for homologs/pariwise alignment== | ||
==hmmer== | ==hmmer== | ||
* link: http://hmmer.org/ | |||
* Installation: <code>% conda install -c biocore hmmer </code> # Anaconda | |||
* Build profile: <code>hmmbuild globins4.hmm globins4.sto</code> | |||
* Search: <code>hmmsearch globins4.hmm uniprot_sprot.fasta > globins4.out</code> | |||
==cdhit== | ==cdhit== | ||
<syntaxhighlight lang="bash"> | <syntaxhighlight lang="bash"> | ||
Line 155: | Line 1,107: | ||
=Programs for producing multiple alignments= | =Programs for producing multiple alignments= | ||
==MUSCLE== | ==MUSCLE== | ||
==MUGSY== | |||
* MUGSY bash | |||
<syntaxhighlight lang="bash"> | |||
#!/bin/sh | |||
export MUGSY_INSTALL=/home/weigang/mugsy_x86-64-v1r2.2 | |||
export PATH=$PATH:$MUGSY_INSTALL:$MUGSY_INSTALL/mapping | |||
export PERL5LIB=$MUGSY_INSTALL/perllibs | |||
#For testing TBA | |||
#export PATH=$PATH:$MUGSY_INSTALL/../../multiz-tba/trunk/ | |||
</syntaxhighlight> | |||
* source the bash file | |||
<syntaxhighlight lang="bash"> | |||
source mugsyenv.sh | |||
</syntaxhighlight> | |||
*run mugsy | |||
<syntaxhighlight lang="bash"> | |||
mugsy --directory /home/chongdi/Streptococcus/mugsy-output -prefix mugsy_aln mugsy-input/*.fa | |||
</syntaxhighlight> | |||
==CLUSTALW== | ==CLUSTALW== | ||
==MAFT== | ==MAFT== | ||
Line 160: | Line 1,134: | ||
=Programs for producing phylogeny & phylogenetic analysis= | =Programs for producing phylogeny & phylogenetic analysis= | ||
==IQ-Tree== | |||
* [http://www.iqtree.org/doc/Tutorial Beginner's tutorial] | |||
* [http://www.iqtree.org/doc/Advanced-Tutorial Advanced tutorial] | |||
* Model search (slow!! better done with byobu) | |||
** <code>iqtree -s example.phy -m MFP</code> | |||
* protein tree with fast bootstrap: | |||
** <code>iqtree -s example.phy -m WAG+I+G -bb 1000</code> (version 1.xx; -B for latest version) | |||
* site-specific rates: | |||
** <code>iqtree -s example.phy -rate</code> | |||
** <code>iqtree -s example.phy -wsr</code> (version 1.xx; -t and -m from above to save time) | |||
* ancestral state reconstruction: | |||
** <code>iqtree -s example.phy -asr</code> (-t and -m from above to save time) | |||
==FastTree== | ==FastTree== | ||
==PHYLIP== | ==PHYLIP== | ||
==MrBayes== | ==MrBayes== | ||
==RaXML== | ==RaXML== | ||
* Required arguments | |||
** -s alignment (in PHYLIP or FASTA) | |||
** -n tag | |||
* Simple run (ML): <code>raxmlHPC-SSE3 -x 12345 -p 12345 -# autoMRE -s concat.fas -m GTRGAMMA -n tag -q part.txt</code> | |||
* Bootstrap: <code>raxmlHPC-SSE3 -f a -x 12345 -p 12345 -# 100 -s concat.fas -m GTRCAT -n tag -q part.txt</code> | |||
* Make a file named "part.txt" with the following lines (Chanlge the number to the total length of your alignment): | |||
<pre> | |||
DNA, gene1codon1 = 1-3765906\3 | |||
DNA, gene1codon2 = 2-3765906\3 | |||
DNA, gene1codon3 = 3-3765906\3 | |||
</pre> | |||
* The resulting files | |||
** RAxML_bestTree.tag: best tree (no bootstrap) | |||
** RAxML_bipartitionsBranchLabels.tag: ignore | |||
** RAxML_bipartitions.tag: main result. Feed this tree to figtree | |||
** RAxML_bootstrap.tag: ignore | |||
** RAxML_info.tag : log file | |||
* Protein models | |||
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAS</code> # protein, gamma, Whelan & Goldman (2001) model | |||
** <code>raxmlHPC-SSE3 -s protein.phy -n A2 -m PROTGAMMAGTR</code> # protein, gamma, user model | |||
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 </code> # protein, gamma, Whelan & Goldman (2001) model, bootstrap | |||
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -o Carp,Loach</code> # protein, gamma, Whelan & Goldman (2001) model, root on a (monophyletic group) | |||
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAS -# autoFC -p 0123 </code> # protein, gamma, Whelan & Goldman (2001) model, bootstrap (at least 99 splits, auto-stopping) | |||
** <code>raxmlHPC-SSE3 -f a -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 -x 0123 </code> # Rapid bootstrap with consensus | |||
*** output 1, RAXML_bestTree.A1 (ml tree) | |||
*** output 2, RAXMLbootstrap.A1 (bootstrap relicates) | |||
*** output 3, RAXMLbipartitions.A1 (tree with boot strap values) | |||
==PhyloNet== | ==PhyloNet== | ||
=R packages for phylogenetics= | =R packages for phylogenetics= | ||
==APE== | ==APE== | ||
===Convert a tree to ultrametric using time estimates=== | |||
<code> | |||
t <- read.tree("core-tree-30otus.dnd") | |||
t.ultra <- chronopl(t, 0) | |||
t.hclust <- as.hclust(t.ultra) | |||
t.dendro <- as.dendrogram(t.hclust) | |||
heatmap(ge.var, Rowv=t.dendro) # order the rows with customized tree | |||
</code> | |||
==phengorn== | ==phengorn== | ||
==phytools== | ==phytools== | ||
=Population genetics= | =Population genetics= | ||
==LDHat: test of recombination based on 4-gametes== | |||
* filter sites: <code>convert -seq snps.sites -loc snps.locs -freqcut 0.08 </code> | |||
* Pairwise LD tests: First generate "lookup" table: <code>lkgen -lk /usr/local/LDhat/lk_files/lk_n50_t0.001 -nseq 48</code>; then calculate pairwise LD statistics: <code>pairwise -seq sites.txt -loc locs.txt -lk new_lk.txt</code> | |||
* Run interval for recombination hot spots: <code>interval -seq sites.txt -loc locs.txt -lk new_lk.txt -its 1000000 -bgen 5 -exact</code> | |||
* Get stats: <code> stat -input rates.txt -loc locs.txt -burn 50 (burnin=100K)</code> | |||
* Plot in R: | |||
<syntaxhighlight lang = "bash"> | |||
x = read.table("rates.txt", skip=1, fill=T); | |||
x = as.matrix(x); | |||
burn.in = 50; | |||
low = as.integer(nrow(x)*burn.in/100); | |||
means<-apply(x[low:nrow(x),], 2, mean, na.rm=T); | |||
q.95<-apply(x[low:nrow(x),], 2, quantile, probs=c(0.025, 0.5, 0.975), na.rm=T); | |||
pos<-as.vector(as.matrix(read.table("locs.txt", as.is=T, skip=1))); | |||
plot(pos[1:(length(pos)-1)], y=means[2:length(means)], type="s", col=rgb(0,0,0.5), | |||
xlab="SNP position (B111)", ylab="Posterior mean recombination rate", main="GBS Strep: recombination by LDhat"); | |||
lines(pos[1:(length(pos)-1)], y=q.95[1,2:length(means)], type="s", col=grey(0.75), lty="dotted"); | |||
lines(pos[1:(length(pos)-1)], y=q.95[3,2:length(means)], type="s", col=grey(0.75), lty="dotted"); | |||
</syntaxhighlight> | |||
==ms: coalescence simulation== | ==ms: coalescence simulation== | ||
==SFS: forward simulation== | ==SFS: forward simulation== | ||
Line 180: | Line 1,228: | ||
==pa2== | ==pa2== | ||
==spiro_genomes/treponema== | ==spiro_genomes/treponema== | ||
==Genome | ==Blast a set of genes against a bacteria genome== | ||
# download genome | |||
# extract gene sequences & translate | |||
# Make blast database | |||
# Run blastp | |||
==''de novo'' variant call with cortex_var== | |||
Create binary file of fasta genome file. | |||
Run contex_var_31_c1 (cutoff 1 used for 1 genome) | |||
*--''se_list'' is the command the reads the list you want to target (ie: list-genome.txt) | |||
*--''kmer_size'' is the middle size, has to be an odd integer | |||
*--''mem_width'' always choose 17 | |||
*--''mem_height'' always choose 100 | |||
*--''dump_binary'' Name your file name (ie: Genome.ctx) | |||
<syntaxhighlight lang="bash"> | |||
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 | |||
--se_list list-Evo.txt | |||
--kmer_size 31 | |||
--mem_width 17 | |||
--mem_height 100 | |||
--dump_binary Evo.ctx | |||
> Evo.log save log file | |||
</syntaxhighlight> | |||
Read each binary file (.ctx) into its own individual color list (ls Evo.ctx > Evo.colorlist) | |||
Then save these lists into their own collective colorlist.txt (ls *.ctx > colorlist.txt) | |||
Reveal genetic variation using the Bubble Caller from cortex_var. | |||
<syntaxhighlight lang="bash"> | |||
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5 | |||
--se_list colorlist.txt | |||
--kmer_size 31 | |||
--mem_width 17 | |||
--mem_height 100 | |||
--dump_binary all-colors.ctx | |||
> all-colors.log save log file | |||
</syntaxhighlight> | |||
Bubble caller will detect differences between each genome by assigning distinct colors to each genome (note that the UK spelling of color is used: '''colour''') | |||
*--''multicolour_bin'' holds your all-colors.ctx binary from the Bubble Caller | |||
*--''detect_bubbles1 i/i'' Detects 1 variation between genomes ''i'' and ''i''. ''i'' indicates the position number the genome is listed on the colorlist.txt file. If the genome is fourth on the colorlist.txt, for example, its corresponding ''i'' variable is 4 | |||
*--''output_bubbles1'' Output variant reads in fasta format (ie: Evo-RefHG.var for bubble detection between | |||
Evolved genome and Reference HG genome) | |||
*--''print_colour_coverages'' necessary for output | |||
<syntaxhighlight lang="bash"> | |||
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5 | |||
--kmer_size 31 | |||
--mem_height 17 | |||
--mem_width 100 | |||
--multicolour_bin all-colors.ctx | |||
--detect_bubbles1 0/1 | |||
--output_bubbles1 Evo-RefHG.var | |||
--print_colour_coverages | |||
> Evo-RefHG.log save log file | |||
</syntaxhighlight> | |||
=Variant call with cortex_var= | |||
==Example== | |||
* Files | |||
<syntaxhighlight> | |||
MS00018_S5_L001_R1_001.fastq | |||
MS00018_S5_L001_R2_001.fastq | |||
KU-090401_S3_L001_R1_001.fastq | |||
KU-090401_S3_L001_R2_001.fastq | |||
... | |||
</syntaxhighlight> | |||
* File content | |||
<syntaxhighlight> | |||
@M03268:52:000000000-AJFAY:1:1101:16970:1555 1:N:0:7 | |||
CCCATGAACGGCACGTTCACGATGCAGAAGGTGGTGACCAGGCCGGTGTCGGCGACCTCGACGTAATCGTCGGTGGGGGA | |||
GCCGTCGCGCGGGTCCGCGCCGCGCGGCGGGAAGTACACCTTGCCGTCCATGCGGGCGCGGGCGCCCAGCAGACGACCCT | |||
CCATGAGCGCATTCAGGAATTCGGCCTCCTGCGGCGCGGCGGTGTGCTTGATATTGAAGTCGACCGGGGTCACGATGCCG | |||
GTGACCGGTT | |||
+ | |||
11>1>111@11>1A1FGF1FC0F0A111010GB0ECBFGF0/AFECGGFHECE??EEGHE/EEEHEFHEHHGCEECE??/ | |||
<CFCGGCCGCCCCCCHCGGCCCG@G??@@?@-@BFFFFFFFFFFF;B@FBFBFF<?@;@@-=?@??@-EFFFFF@;@@@F | |||
FFBF/BBF;@@FFFFFFFFFFFFF@FFFFFFF<@@@@@@@@@@@FBFFFFFFFFFFFFFFFFF@@@?@?@FFFFFFFBF@ | |||
?@@EFF@@<B | |||
@M03268:52:000000000-AJFAY:1:1101:16136:1618 1:N:0:7 | |||
TCCTGGCCCGTGAAACCGCTTGCCCGGTACAGGTTCTGGACTACCGCCTGGCACCCGAGCATCCGTTCCCGGCGGCGCTC | |||
GACGACGCCGAGGCGGCGTTCCTGGAACTGGTGGCCGCCGGCTACCGGCCCGAACGCATTGCGGTCGCGGGTGATTCGGCCGGTGGCGGGCTCTCGCTCGCGCTGGCCGAACGGCTGCGCGACCGGCACGGGCTGGTTCCGGCCGCGCTCGGGCTGATCGCGCCCTGGGC | |||
+ | |||
11>>11C1A@A?11BDF?EEGAFGFG?ECCHBHHGFG1EGHFHHHCCGGC/GCGGGCEECEFGHGGFFGHGGGGCECCC<CCC@CCCC@CGCCCGGC?:@EBFBF/CFFFF0/CFG?=@=-9>AFF@=@@?@;@@@F--9:BF@A@@?@--;9-BFFFF@A-@99B?-9@?=EFFBAAE;9>?;@@BF@9-@-;@=-E@@@=@@;@?>@9@<?@?BBBFFF;?>;;-@@?@<?@?9-FBFF@-99-9E-9 | |||
... | |||
</syntaxhighlight> | |||
==Step 1== | |||
* Create matched FASTQ files (python script) | |||
<syntaxhighlight lang="python"> | |||
#!/usr/bin/python | |||
from sys import argv | |||
script, File1, File2 = argv | |||
# Create a dictionary listing the sequences in the first file for reference | |||
file1 = open(File1) | |||
dict1 = {} | |||
for line in file1: | |||
if '@M03268' in line: | |||
tag1 = line.rstrip()[:-9] | |||
tail = line.rstrip()[-9:] | |||
dict1[tag1] = [] | |||
else: | |||
dict1[tag1].append(line.rstrip()) | |||
file1.close() | |||
# Create two output files | |||
f1 = open(File1.replace('.fastq', '_mat.fq'), 'w') | |||
f2 = open(File2.replace('.fastq', '_mat.fq'), 'w') | |||
# Match the sequence | |||
file2 = open(File2) | |||
for line in file2: | |||
if dict1.has_key(line.rstrip()[:-9]): # The has_key method | |||
tag1 = line.rstrip()[:-9] | |||
f1.write(tag1 + tail + '\n') | |||
for j in range(3): | |||
f1.write(dict1[tag1][j] + '\n') | |||
del dict1[tag1] | |||
dict2 = {} # Create a temporary dictionary for sequence in the file2 | |||
tag2 = line.rstrip() | |||
dict2[tag2] = [] | |||
else: | |||
dict2[tag2].append(line.rstrip()) | |||
if len(dict2[tag2]) == 3: | |||
f2.write(tag2 + '\n') | |||
for j in range(3): | |||
f2.write(dict2[tag2][j] + '\n') | |||
file2.close() | |||
f1.close() | |||
f2.close() | |||
</syntaxhighlight> | |||
==Step 2. Clean Fastq Files & Run Single-color Graph & Error Cleaning== | |||
* Create a file list showing all outcome files whose extensions need to be changed from _mat.fq to .list | |||
<syntaxhighlight lang="bash"> | |||
bbduk.sh -Xmx1g in1=fastq_file1 in2=fastq_file2 -out1=clean1.fq -out2=clean2.fq qtrim=rl trimq=20 | |||
</syntaxhighlight> | |||
* Create a file list showing all outcome files whose extensions need to be changed from _mat.fq to .list | |||
<syntaxhighlight lang="bash"> | |||
for f in *_mat.fq; | |||
do | |||
title=$(echo $f | cut -d'_' -f2); | |||
id=$(echo $f | cut -d'_' -f1); | |||
echo $f > ${id}.list${title}; | |||
done | |||
</syntaxhighlight> | |||
* Single color graph for sample | |||
<syntaxhighlight lang="bash"> | |||
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 | |||
--pe_list MS00018_S5.list1,MS00018_S5.list2 | |||
--kmer_size 31 | |||
--mem_height 17 | |||
--mem_width 100 | |||
--dump_binary MS00018_S5.ctx | |||
--sample_id MS00018_S5 | |||
--remove_pcr_duplicates | |||
--quality_score_threshold 20 > MS00018_S5.log | |||
</syntaxhighlight> | |||
* Single color graph for reference | |||
<syntaxhighlight lang="bash"> | |||
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 | |||
--se_list ref.filelist | |||
--kmer_size 31 | |||
--mem_height 17 | |||
--mem_width 100 | |||
--dump_binary ref.ctx | |||
--sample_id ref 20 > ref.log | |||
</syntaxhighlight> | |||
* Run Error Cleaning for All Samples (reference is not included) | |||
<syntaxhighlight lang="bash"> | |||
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 | |||
--mem_height 18 | |||
--mem_width 100 | |||
--kmer_size 31 | |||
--multicolour_bin N18_S15.ctx | |||
--remove_low_coverage_supernodes 10 | |||
--dump_binary N18_S15.cleaned.ctx | |||
</syntaxhighlight> | |||
==Step 3== | |||
* Pull the name of each .cleaned.ctx file to a cleaned.list file, then create a .filelist file for all cleaned.list files. | |||
<syntaxhighlight lang="bash"> | |||
ls file1.cleaned.ctx > file1.cleaned.list | |||
ls file2.cleaned.ctx > file2.cleaned.list | |||
ls ref.ctx > ref.list | |||
ls -1 *.list > ref-sample.filelist | |||
</syntaxhighlight> | |||
* Multicolour Graph | |||
<syntaxhighlight lang="bash"> | |||
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3 | |||
--mem_height 20 | |||
--mem_width 100 | |||
--kmer_size 31 | |||
--colour_list ref-sample.filelist | |||
--dump_binary ref-sample.ctx > ref-sample.log | |||
</syntaxhighlight> | |||
==Step 4== | |||
* Variation Discovery Using The Bubble Caller | |||
<syntaxhighlight lang="bash"> | |||
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3 | |||
--mem_height 20 | |||
--mem_width 100 | |||
--kmer_size 31 | |||
--multicolour_bin ref-sample.ctx | |||
--detect_bubbles1 -1/-1 | |||
--ref_colour 2 | |||
--output_bubbles1 bubbles-in-sample.out | |||
--print_colour_coverages | |||
--experiment_type EachColourAHaploidSampleExceptTheRefColour | |||
--genome_size 8000000 > bubbles-in-sample.log | |||
</syntaxhighlight> | |||
==Step 5== | |||
* Reference Genome | |||
(When in Cluster execute "module load stampy": doesn't work; path problem) | |||
(Run the following on wallace:) | |||
<syntaxhighlight lang="bash"> | |||
stampy.py -G ref ref.fa | |||
stampy.py -g ref -H ref | |||
</syntaxhighlight> | |||
* Turn Into VCF with reference | |||
Make a sample list file (from bubble or multicolor log file): | |||
<code>cat e1-bubbles-in-sample.log | grep CLEANED | cut -f2 > e1.sample.lis</code> | |||
Customize the following command based on your output files, num of colors, index of ref colors, etc | |||
<code> | |||
perl /home/weigang/CORTEX_release_v1.0.5.21/scripts/analyse_variants/process_calls-wq.pl --callfile e1-bubbles-in-sample.out --callfile_log e1-bubbles-in-sample.log -outvcf e1-bubbles-in-sample --outdir e1-vcfout --samplename_list e1.sample.list --num_cols 7 --stampy_bin /home/weigang/stampy-1.0.28/stampy.py --stampy_hash ref --refcol 6 --vcftools_dir /usr/local/bin --caller BC --kmer 31 --ploidy 1 | |||
</code> | |||
==Step 6. Parse VCF files== | |||
# Filter out low-quality sites: | |||
<code>vcftools --vcf pat-5.decomp.vcf --keep-filtered PASS --recode --out pat-5</code> | |||
# Extract coverage | |||
<code>vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info COV </code> | |||
(output file: out.COV.FORMAT) | |||
# Extract genotypes | |||
<code>vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT </code> | |||
(output file: out.GT.FORMAT) | |||
# Extract confidence | |||
<code>vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT_CONF</code> | |||
(output file: out.GT_CONF.FORMAT) | |||
# Send GT_CONF file to R and visualize log10(conf) distribution with boxplot | |||
# Use custom PERL file to filter out low-quality (e.g., GT_CONF < 30) genotype calls (flag with "?" or "na"), and make haplotype GT ("1/1" to "1", "0/0" to "0", "./." to "?) | |||
# Verify variants using IGV (see IGV protocol above) | |||
==Step 7. Variant Annotation & Visualization== | |||
# <code>vcf_parser.py out.decomp.vcf ref.gb</code> (the code needs validation) | |||
# Data to database & web visualization, if necessary | |||
=hmmer= | |||
==Annotate proteins with TIGRFAM== | |||
<syntaxhighlight lang="bash"> | |||
hmmsearch --tblout foo.hmmout # table output for all sequences | |||
--domtblout foo.dmout # table output for all domains | |||
-E 0.01 # level of sequence significance | |||
--domE 0.01 # level of domain significance | |||
-o /dev/null # don't show STDOUT | |||
../../TIGRFAMs-Release-15-Sep-17-2014/TIGRFAMs_15.0_HMM.LIB # HMM profile library for tiger fams | |||
GCA_000583715.2_ASM58371v2_protein.faa & # input/query file in FASTA | |||
</syntaxhighlight> | |||
=PopGenome= | |||
<syntaxhighlight lang=R"> | |||
library(PopGenome) | |||
g = readVCF("pvt1.recode.vcf.gz", 1000, "8", 127890000, 128102000) | |||
pops = split(sample[,1], sample[,2]) # create a list of populations | |||
g = set.populations(g, pops, diploid = T) # set population names | |||
# by windows | |||
slide = sliding.window.transform(g, width = 100, jump = 20) # nsnps, not actual length | |||
slide = F_ST.stats(slide, mode = "nucleotide") | |||
snp.pos = slide@region.data@biallelic.sites # SNP positions | |||
win.num = length(slide@region.names) | |||
win.start = numeric() | |||
for (i in 1:win.num) {win.start[i] = snp.pos[[i]][1]} | |||
fst = slide@nuc.F_ST.vs.all | |||
pop.names = names(slide@populations) # population names | |||
plot(win.start, fst[,1], type ="n", las = 1, ylab = expression(F[st]), xlab = "SNP Position", ylim = c(0,0.4)) | |||
for (i in 1:length(slide@populations)) { | |||
lines(win.start, fst[,i], type = "l", col = pop.group[pop.names[i],4]) | |||
} | |||
arch.coords=c(127982050, 127992931) | |||
abline(v = arch.coords, col = "orange") | |||
#rect(xleft = arch.coords[1], ybottom = -1, xright = arch.coords[2], ytop = 0.5, border = "transparent", col = 2) | |||
</syntaxhighlight> | |||
=Velvet= | |||
==Cleaning== | |||
<syntaxhighlight lang="bash"> | |||
../../bbmap/bbduk.sh -Xmx1g | |||
in1=WGC067462_hunhewenku_509_combined_R1.fastq # gz file works as well | |||
in2=WGC067462_hunhewenku_509_combined_R2.fastq # gz file works as well | |||
-out1=clean1.fq | |||
-out2=clean2.fq | |||
qtrim=rl | |||
trimq=20 | |||
</syntaxhighlight> | |||
==Running Velvet Optimizer== | |||
<syntaxhighlight lang="bash"> | |||
srun ../../VelvetOptimiser-2.2.5/VelvetOptimiser.pl | |||
--t 32 | |||
--s 31 --e 31 --x 6 # kmer sizes | |||
-f '-shortPaired -fastq clean1.fq -shortPaired2 -fastq clean2.fq' | |||
-t 4 | |||
--optFuncKmer ‘n50’ | |||
-p prefix | |||
</syntaxhighlight> | |||
=PacBio assembly with canu= | |||
<code>./canu -p staph-auto-5 -d staph-auto-5 genomeSize=2.2m -pacbio-raw pac-reads-5.tar.gz</code> |
Latest revision as of 05:37, 2 November 2024
Weblogo
- install weblogo from github
- Or install with conda:
conda install -c conda-forge weblogo
- Or install with pip:
pip install weblogo
- run the following command
weblogo -f IR4-pep.fas -o ir4-logo.pdf -D fasta -F pdf -A protein -s large -t "IR4" -c chemistry
Tree visual with ggtree
setwd("C:/Users/Weigang/Dropbox/Natasha-files/")
library(ggtree)
library(treeio)
library(tidyverse)
# read tree
tree <- read.tree("asian_clade3_v2.nwk")
# create a tibble of OTU groups
tips <- tibble(id = tree$tip.label,
origin = c(rep("Eurasia",4), "US", "Eurasia",
"US", rep("Eurasia",25)))
# plot tree
p <- ggtree(tree) +
xlim(0,0.02) + # to avoid overflow of labels
geom_treescale(x=0, y=30)
# join tree and add group color
p %<+% tips +
geom_tiplab(aes(color = origin)) +
scale_color_manual(values = c("Eurasia" = 1, "US" = 2)) +
theme(legend.position = "none")
Github for sharing data and source code
- SSH setup (github no longer allows password-based push to remote repository)
- Create a new SSH key on your computer
- Add the key to your Github account
- Commit with SSH
- Authentication/test:
ssh -T git@github.com
- Add repository for commit:
git remote set-url origin git@github.com:bioperl/p5-bpwrapper.git
- Authentication/test:
- Developer workflow
- Clone remote repository:
git clone <rep-name.git>
- Sync local to remote repository:
git pull
- Check local repository status:
git status
- Show the latest commit:
git log
- Add new file:
git add <filename>
- Commit changes to local repository:
git commit -a -m "message"
- Push to remote repository:
git push
- Clone remote repository:
bcftools pipeline
bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf vcftools --vcf snps4.vcf --maf 0.01 --recode --recode-INFO-all --out maf (n=33 SNPs) bcftools query -l maf.recode.vcf > samples (n=17775 samples) cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' maf.recode.vcf; echo; done > samples-maf.fas
R tips
Add expected normal curvce to histogram
plot.trait <- function(qt.out) {
phe <- qt.out[[4]]$phe
exp.mean <- qt.out[['trait.exp']][1]
exp.sd <- sqrt(qt.out[['trait.exp']][2])
x <- seq(min(phe), max(phe), length = 100)
fun <- dnorm(x, mean = exp.mean, sd = exp.sd)
hist(phe, probability = T, ylim = c(0, max(fun)), br = 50, las = 1)
lines(x, fun, col = 2, lwd = 2)
}
Run batch contingency table tests
- Numbered list item
snp_ct <- snp_long %>% group_by(POS, geno, virulence) %>% count()
# get pos with < 4 entries:
bad_pos <- snp_ct %>% group_by(POS) %>% count() %>% filter(n < 4)
library(broom)
# batch fisher's exact test
snp_fisher <- snp_long %>%
filter(!POS %in% bad_pos$POS) %>%
group_by(POS) %>%
do(tidy(xtabs(~geno + virulence, data = .) %>%
fisher.test(simulate.p.value = T)))
snp_fisher <- snp_fisher %>% mutate(y = -log10(p.value))
# plot vocano
snp_fisher %>%
ggplot(aes(x = estimate, y = y)) +
geom_point(shape = 1, alpha = 0.5) +
scale_x_log10() +
geom_vline(xintercept = 1, linetype = 2, color = "red") +
theme_bw() +
xlab("odds ratio (log10)") +
ylab("signficance (-log10[p])")
multiple regression models (with broom::tidy)
library(broom)
compensation.models <- compensation %>% group_by(Grazing) %>% do(tidy(lm(Fruit ~ Root, data = .))) %>% filter(term != '(Intercept)')
multiple regression models (with nested tibble)
# build multiple models, one for each serum:
by_serum <- two_od %>% group_by(Serum) %>% nest() # nested data frame, one row per serum
x_model <- function(df) { # model function (similar to lapply)
x <- df %>% filter(OspC!='A02' & OspC!='A04')
# x <- df %>% filter(OspC!='A02' & OspC!='A04' & OspC!='N14')
lm(OD1 ~ OD2, data = x)
}
by_serum <- by_serum %>% mutate(model = map(data, x_model)) # add model to each serum
output <- vector("list")
for(i in 1:length(by_serum$Serum)) {
df <- by_serum$data[[i]]
model <- by_serum$model[[i]]
pd <- predict.lm(model, newdata = data.frame(OD2 = df$OD2))
output[[i]] <- data.frame(OspC = df$OspC,
OD2 = df$OD2,
OD1 = df$OD1,
OD1_pred = pd,
Serum = rep(by_serum$Serum[i], nrow(df))
)
}
df.out <- bind_rows(output)
multiple regression models (with custum functions)
# build model
models <- xy %>% split(.$Serum) %>% map(~lm( aff_PL ~ aff_C3H, data= .))
# get r-squared:
models %>% map(summary) %>% map_dbl("adj.r.squared")
# get p values (of slope, using a custom function):
get.pvalue <- function(x){
s <- summary(x);
s$coefficients[2, 4]
}
models %>% map(get.pvalue) %>% unlist()
# get slope:
get.slope <- function(x){
s <- summary(x);
s$coefficients[2, 1]
}
models %>% map(get.slope) %>% unlist()
# get intercept:
get.intercept <- function(x){
s <- summary(x);
s$coefficients[1, 1]
}
models %>% map(get.intercept) %>% unlist()
Microbial genome alignment with MUMMER4 and other tools
SARS-CoV-2 GISAID pipleline
####################################### # Parse GISAID sequences fasta ###################################### 1. bioseq -B cov.fas # burst into individual files 1a. move the un-bursted file out the "human-files" directory!!! mv cov-human.fas ../ 2. sam align (weigang@wallace:~/cov-03-09-2030/host-human$ for f in *.fas; do ../sam-align.bash $f; done ) nucmer --sam-long=COH1 B111.fa COH1.fa samtools view -b COH1.sam -T B111.fa > COH1.bam samtools sort COH1.bam -o COH1.sorted.bam samtools index COH1.sorted.bam with script: for f in *.fas; do ../sam-align.bash $f; done 3c. Less strict call: bcftools mpileup -Ou -f ../ref.fas *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf -P 0.05 (or -P 0.1; large P value for less strict call, default 1.1e-3) # 3b. bcftools mpileup -Ou -f ../ref.fas *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf, with default ploidy as 1: weigang@wallace:~/cov-03-09-2020/cov57$ cat ploidy.txt * * * M 1 6a. bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf ( get only biallelic SNPs) 6b. vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all --out cov.vcf # filter SNPs only 7. filter sites by allele counts: only informative sites bcftools view snps.bcf > snps.vcf vcftools --vcf snps.vcf --mac 2 --recode --recode-INFO-all --out snps2.vcf
Pipeline
###################### Host & Enviroment: wqiu@bioit.hunter.cuny.edu vcftools & clonalframe "source activate gbs" bcftools: "source activate ngs" 1. align fastq to ref 1) index reference genome bwa index B111.fa => 5 files: amb, ann, bwt, pac, sa 2) align reads to ref ls -1 *.gz | cut -d'.' -f1 | sort -u > sample.list byobu cat sample.list | while read line; do bwa mem B111.fa ${line}.R1.fastq.gz ${line}.R2.fastq.gz > $line.sam; done => sam file 2. convert to bam, sort by ref coordinates cat sample.list | while read line; do samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam; done => acc.bam 1+2: wrapped to save space cat sample.list | while read line; do begin=$(date +"%D:%H:%M") echo -ne "$line ... started $begin ..."; bwa mem B111.fa ${line}.clean.1.fastq.gz ${line}.clean.2.fastq.gz > $line.sam 2> /dev/null; bwa_time=$(date +"%D:%H:%M") echo -ne "sam file generated: $line.sam on $bwa_time ..."; samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam 2> /dev/null; bam_time=$(date +"%D:%H:%M") echo -ne "bam file generated: $line.bam on $bam_time ..."; rm $line.sam; echo "sam file deleted. Done"; done 3. index cat sample.list | while read line; do samtools index ${line}.bam; done => acc.bam.bai 4. Variant Call 1) make ploidy.txt file cat > ploidy.txt * * * M 1 2) pileup and make bcf file bcftools mpileup -Ou -f B111.fa *.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf -P 0.05 ((or -P 0.1; large P value for less strict call, default 1.1e-3)) => calls.bcf 5. analyse calls.bcf; remove outlier SNPs and samples 1) check stats bcftools stats calls.bcf > call.stats ts/tv (transition/transversion) better greater than 3 check AF freq, remove the first class if necessary (e.g., vcf --maf 0.008) check counts: vcftools --vcf snps.vcf --counts --out snps Filter by maf counts (at least 2): vcftools --vcf snps-255.recode.vcf --recode --recode-INFO-all --out snps2 --mac 2 2) get only biallelic SNPs bcftools view -m2 -M2 --types snps calls-87.bcf > snps-87.vcf bcftools stats snps.vcf | ll bcftools view -m2 -M2 --types snps call-49.vcf > snps-49.vcf 3) merge samples bgzip snps-87.vcf bgzip snps-49.vcf => vcf.gz file bcftools index snps-87.vcf.gz bcftools index snps-49.vcf.gz => vcf.gz.csi file bcftools merge -Ob snps-87.vcf.gz snps-49.vcf.gz > snps-136.bcf get only biallelic SNPs bcftools view -m2 -M2 --types snps snps-136.bcf > snps-136.vcf ln -s snps-136.vcf snps.vcf # use vcftools to remove invariant sites, in case samples are dropped: vcftools --gzvcf snps2.vcf.gz --non-ref-ac-any 1 --recode --recode-INFO-all --out tmp --remove-indv IND 6. vcf to fasta 1) modify sample name (should use "bcftools reheader -s change-id.txt -o tmp.vcf snps.vcf") cat snps.vcf | sed 's/.bam//g; s/.sorted//g; s/batch-..//g' > tmp.vcf mv tmp.vcf snps-136.vcf 2) sample list bcftools query -l snps.vcf > sample.txt 3) make fasta: could be gzipped cat sample.txt | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' snps.vcf; echo; done > sample.fas Note: . in fas file means missing nt, handled correctly iqtree 4) get ref seq echo ">B31" > ref.fas ll snps.vcf => get ref id grep "^Chromosome" snps.vcf | cut -f4 | paste -s -d '' - >> ref.fas #grep "^CP021772" snps.vcf | cut -f4 | paste -s -d '\0' - >> ref.fas (#if on apple system) cat ref.fas >> sample.fas => 137 samples, 46783 snps go to genometracker: scp sample.fas snps-136.vcf weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/. on genometracker: ln -s snps-136.vcf snps.vcf # subset samples (e.g., bbss only): bcftools view -Ov -S samples-bbss.txt --force-samples snp5-lp17.vcf > snp5-lp17-bbss.vcf 7. make tree (with iqtree, for SNP-only alignment, with bootstrap; can't have constant sites) iqtree2 -s sample.fas -m GTR+ASC -B 1000 -nt AUTO (-n AUTO to use all CPUs on the cluster) # if containing full seq alignment with constant sites, e.g., HIV env gene alignment # iqtree -s $work_dir/test.fas -m GTR+F+G -B 1000 --redo -nt AUTO => sample.fas.contree 8. consequence call 1) download ref in genbank format NCBI nucleotide database: search CP021772 => B111.gb 2) convert genbank to gff3 format (1) py file: modified from https://dmnfarrell.github.io/bioinformatics/bcftools-csq-gff-format => gb2gff3.py (2) ./gb2gff3.py B111.gb => B111.gff3 (3) change acc to match ref name in snps.vcf cat B111.gff3 | sed 's/^CP021772.1/CP021772/' > tmp.gff3 mv tmp.gff3 B111.gff3 3) call consequence (to tsv or bcf files) bcftools csq -f B111.fa -g B111.gff3 snps.vcf -Ot -o snps-csq.tsv cut -f2,5,6 snps-csq.tsv | tr '|' '\t' | cut -f1-5,7- > snps-csq2.tsv 9. web development 1) orf info from genbank perl get-orf-by-acc.pl CP021772 > orf.csv 2) scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/sample.fas.contree . #biotree -m sample.fas.contree | biotree --ref 'B111' > sample.dnd #biotree -r 'GBS02' sample.fas.contree | biotree --ref 'B111' > sample.dnd #biotree -E 0.005 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > sample.dnd biotree -D 90 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > tree.dnd tree visualization by R or PhyloView or R/ggtree 3) scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/snps-csq2.tsv . 4) pheno.csv
clonal frame pipeline (WGS)
- Reconstitute whole-genome alignment from vcf; following this post: https://gatk.broadinstitute.org/hc/en-us/community/posts/360070002671-converting-multisample-vcf-to-fasta. Algorithm: split vcf into single-sample vcf's; use "bcftools consensus" to obtain alternate ref seqs. Environment: conda activate ngs (on bioit)
- create one-sample vcf's
cat ../samples-bbss.txt | while read line; do bcftools view -Ov -s $line ../bbss-cp26.vcf.gz > snp5-cp26-$line.vcf; done
- bgzip & tabit
for f in *.vcf; do bgzip $f; tabix $f.gz; done (or bcftools index) for f in snp5-lp54-*.vcf; do bgzip $f; bcftools index $f.gz; done
- Get consensus
From doc: "Create consensus sequence by applying VCF variants to a reference fasta file. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence. Using the --sample (and, optionally, --haplotype) option will apply genotype (haplotype) calls from FORMAT/GT." tail -27 ../samples-bbss.txt | while read line; do cat ../../B31-files/cp26_plasmid.fasta | bcftools consensus --sample $line snp5-cp26-$line.vcf.gz | sed "s/>cp26/>$line/" > bbss-cp26-$line.fa; done
- Add reference:
cat ../../B31-files/cp26_plasmid.fasta | sed "s/>cp26/>B31/" > bbss-cp26-B31.fa
concatenate & check length: cat *.fa > wgs-cp26.fasta bioseq -l wgs-cp26.fasta
check with bioaln for variability conda activate gbs bioaln -i 'fasta' -m wgs-cp26.fasta | less
- run ClonalFrame (conda env "gbs")
srun ClonalFrameML ss-cp26.dnd wgs-cp26.fasta wgs_cp26 -kappa 4.00 -emsim 100
Align two assemblies
- With minimap2: https://github.com/lh3/minimap2
minimap2 -ax asm5 ref.fa asm.fa > aln.sam # assembly to assembly/ref alignment
"For cross-species full-genome alignment, the scoring system needs to be tuned according to the sequence divergence."
"-au options: asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5% sequence divergence"
- With MUMMER4 (MUMMER4): align two genomes, output delta file:
- nucmer -p A909-COH1 COH1.fa A909.fa
- dnadiff -p A909-COH1 -d A909-COH1.delta
- less A909-ref.report, OR
grep AvgIdentity A909-ref.report
- output sam file
- nucmer --sam-long=COH1 B111.fa COH1.fa
- samtools view -b COH1.sam -T B111.fa > COH1.bam
- samtools sort COH1.bam -o COH1.sorted.bam
- samtools index COH1.sorted.bam
- Use GSAlign
conda config --add channels defaults conda config --add channels bioconda conda config --add channels conda-forge conda install gsalign gsalign index ref.fas cov gsalign -i cov -q test.fas
- bcf annotate with gff file:
bcftools csq -f ../ensembl-B31-downloads/lp54.fa -g ../ensembl-B31-downloads/lp54.gff3 -Ov -o lp54.anno.vcf lp54.vcf
- variant call with bcftools: https://samtools.github.io/bcftools/howtos/variant-calling.html
- bcftools cheatsheet: https://gist.github.com/elowy01/93922762e131d7abd3c7e8e166a74a0b
BERT classifier
- Project start: Winter 2020
- Project team: Saadmanul Islam <Saadmanul.Islam91@myhunter.cuny.edu>
- Tutorial page: https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04
- Data set: bioluminascence
- Goal: cross validation
Estimate LD50 using GLM
## curves
library(dplyr)
library(ggplot2)
library(growthcurver)
library(reshape2)
library(purrr)
setwd("/Users/desireepante/Desktop/Programming")
OD20<-read.csv("OD_0220.csv")
OD220<-OD20[-c(6,11:12,20,26:28, 32:33,45:47),]
od <- filter(OD220, IPTG == 0);
#N15
od15d <- mutate(od15d, od.norm = OD/max(OD))
test.1<-glm(min ~ OD, data= od15d, family= "binomial")
ilink<-family(test.1)$linkinv
test.1.pd <- with(od15d,
data.frame(min = seq(min(min), max(min),
length = 10)))
test.1.pd <- cbind(test.1.pd, predict(test.1, test.1.pd, type = "link", se.fit = TRUE)[1:2])
test.1.pd <- transform(test.1.pd, Fitted = ilink(fit), Upper = ilink(fit + ( se.fit)),
Lower = ilink(fit - ( se.fit)))
test.test<-ggplot(od15d, aes(x = min, y = od.norm)) +
geom_ribbon(data = test.1.pd, aes(ymin = Lower, ymax = Upper, x = min),
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
geom_line(data = test.1.pd, aes(y = Fitted, x = min)) +
geom_point()
test.test
1k genome project/VCFTools
# Extract APOL1 locus
vcftools --gzvcf ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --from-bp 36253071 --to-bp 36267530 --recode --stdout --recode-INFO-all --chr 22 | gzip -c > apol1.vcf.gz
# allele frequencies/counts
vcftools --vcf pvt1.recode.vcf --keep AFR.supergroup --counts --out afr
# pi
# Fst
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop AMR.supergroup --vcf pvt1.recode.vcf --out AFR-AMR --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AFR-EAS --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AFR-EUR --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AFR-SAS --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AMR-EAS --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AMR-EUR --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AMR-SAS --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out EAS-EUR --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EAS-SAS --remove-indels
vcftools --weir-fst-pop EUR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EUR-SAS --remove-indels
# concatenate vcf from different chromosomes
vcf-concat apol1-exon-6-snps.vcf tvp23-exon-1-snps.vcf hpr-exon-5-snps.vcf > 3-exons.vcf
# vcf to tabular format
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp
vcf-query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp
Ggplot2 tips
- Data binning and nice histograms with density and boxplot:
# numerical vector as factor so that it is sorted numerically (no need for as.character())
x.df$train.size <- as.factor(x.df$train.size)
# jitter by group with "position_jitterdodge()"
x.df %>%
ggplot(aes(x=rate.cat, y=accuracy, color=group)) +
geom_boxplot(position=position_dodge(0.8)) +
geom_jitter(position=position_jitterdodge(0.8)) +
facet_wrap(~train.size, nrow = 1) + ylim(0,1) +
geom_abline(intercept = 0.5, slope = 0, linetype="dashed") +
theme(legend.position = "bottom")
# line plot with multiple factors with "interaction()"
rec.df %>%
ggplot(aes(x = tissue, y = rec.rate)) +
geom_boxplot(outlier.shape = NA, aes(fill = tissue), alpha = 0.5) +
geom_point(size = 3, alpha = 0.5, aes(color = symptom)) +
geom_line(aes(group = interaction(id, timepoint)), linetype = 2) +
theme_bw() +
scale_fill_manual(values = c("lightgreen", "lightpink")) +
ylab("Num Rec frags") +
xlab("tissue") +
scale_y_log10()
Capture results in R/GA example
# in general use
lapply(seq_along(x), function(i) {name=names(x}[i])
# to capture names
# save to a data.frame
# bind into a single data.frame using dplyr function:
x.df <-bind_rows(x.rate)
library(stringdist)
library(ggplot2)
out.fit <- lapply(seq(from = 5,to = 150, by = 5), function (num.epi) {
num.alle <- 20; # num of antigens
epi.fit <- function(x) { # fitness defined by the minimum pair diff
x.list <- split(x, rep(1:num.alle, each=num.epi))
# sum(seq_distmatrix(x.list, method = "hamming"))/(num.alle * (num.alle-1) / 2)/num.epi # average pairwise diff
min(seq_distmatrix(x.list, method = "hamming"))/num.epi # min diff
}
epi.ga <- ga(type = "binary", fitness = epi.fit, nBits = num.epi * num.alle)
c(all = num.alle, epi = num.epi, min.pair.dist = epi.ga@fitnessValue)
})
epi.df <- t(as.data.frame(out.fit))
rownames(epi.df) <- 1:nrow(epi.df)
ggplot(as.data.frame(epi.df), aes(x=epi, y=min.pair.dist)) + geom_point() + geom_line()
NCI Cloud (Seven Bridges)
- Documentation: http://www.cancergenomicscloud.org/controlled-access-data/
- API Documentation (to access meta-data)
- Steps (June 20, 2018)
- Create project
- Data -> Data Overview -> Select cancer type -> Data Browser
- Copy files to project
- Go to project, "add filter", filter by "experimental strategy" (RNA-SEQ, miRNA-SEQ)
- To download files: select & Download link; run
aria2c -i download-links.txt
- To download meta data: Download manifest
- Steps (better)
- Create project
- Search "Public Apps"
- Load data & Run
- Steps (a little hard to follow)
- Create project
- Data -> Data Overview -> Select cancer type -> Data Browser
- Click File to add property (e.g., "miRNA") -> "Copy Files to Project" -> choose project to add & add a tag to the files
- Go to Project -> Files -> filter files by tag name -> select -> Save
- Choose a public protocol/workflow (e.g., RNA-SEQ alignment)
- Add index and GTF files
- Go to Project -> task -> run
Reloop a circular plasmid from PacBio assembly
- Input file: Z9-cp26.fa (containing overlapping ends); b31-cp26.fa (reference)
- Find overlap ends by blasting against itself: blastn -task megablast -query Z9-cp26.fa -subject Z9-cp26.fa -evalue 1e-100 -outfmt 6
- The result says “1 -7617” is the same as “26489-34118”, so we use bioseq to remove overlapping part: bioseq –s”1,26488” Z9-cp26.fa > Z9-cp26.fa2
- Run numcer against b31-cp26 to identify the starting position: nucmer --coord ../b31-cp26.fa Z9-cp26.fa2
- The result says the first base of B31 corresponds to 22171 at Z9-cp26, so we run bioseq to reloop: bioseq –R 22171 Z9-cp26.fa2 > Z9-cp26.fa3
- Run numcer to confirm: nucmer --coord ../b31-cp26.fa Z9-cp26.fa3
KEGG REST API
# Author: Edgaras Bezrucenkovas
# Date: Nov 12, 2017
#KEGG API
#Obtain module pathways and orthologs from KEGG pa14 database
#Import module to fetch the data from KEGG database
import urllib.request
import re #Regular expressions
#Get PA14 modules
kegg_db = "http://rest.kegg.jp/link/module/pau"
pa14_mdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_mods = []
for line in pa14_mdata:
line = line.split("\t")
mod = line[1].strip("md:pau_")
pa14_mods.append(mod)
#Get PA14 genes
kegg_db = "http://rest.kegg.jp/link/pau/module"
pa14_gdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_genes = []
for line in pa14_gdata:
gdata = line.split("\t")
gdata[0] = gdata[0].strip("md:pau_")
gdata[1] = gdata[1].strip("pau:")
pa14_genes.append(gdata)
#Open the file containing KEGG compounds ids
id_file = open("kegg-entries.txt")
kegg_ids = id_file.readlines()
compound_ids = []
for cid in kegg_ids:
cid = cid.strip("\n")
compound_ids.append(cid)
id_file.close()
#Get module pathway ids
for cid in compound_ids:
kegg_db = "http://rest.kegg.jp/link/module/compound:"+cid
module_data = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
module_ids = []
for line in module_data:
if re.match("cpd", line):
line = line.split("\t")
m_id = line[1].strip("md:")
module_ids.append(m_id)
else:
module_ids.append("NA")
#Filter module pathway ids
modules_final = []
for m_id in module_ids:
if m_id == "NA":
modules_final.append("NA")
elif m_id in pa14_mods:
modules_final.append(m_id)
if not modules_final:
modules_final.append("NA")
#Get genes
genes = {}
for m_id in modules_final:
key = m_id
genes.setdefault(key, [])
if m_id == "NA":
genes[key].append("NA")
else:
for i in pa14_genes:
if m_id == i[0]:
genes[key].append(i[1])
#Get module names using module ids
module_names = {}
for m_id in modules_final:
if m_id == "NA":
m_name = {"NA":"NA"}
module_names.update(m_name)
else:
kegg_db = "http://rest.kegg.jp/list/module:"+m_id
module_data = urllib.request.urlopen(kegg_db).read().decode("utf-8").split("\t")
module_name = module_data[1].strip("\n")
m = {m_id:module_name}
module_names.update(m)
#Output
for k, v in genes.items():
for i in v:
print(cid,k,i,module_names.get(k), sep = "\t")
D3js Tutorial
- Install brackets editor from adobe: http://brackets.io/
- Enable web server & create a directory "public_html" in your home directory
- Create the following file structure: test/index.html; test/js/index.js; test/css/index.css
- Basic D3: following this link
- Metabolomics: following this link
Transform a wide data frame to a long one
- A function in R
df2long = function(x, varname, cname, rname){
num.col = dim(x)[2]
x$tmp = rownames(x)
y = reshape(x, varying=1:num.col, v.names=varname, timevar=cname, times=names(x)[1:num.col], direction="long")
y=y[,-4]
rownames(y)=NULL
colnames(y)[1] = rname
y
}
- A function in Perl
#!/usr/bin/env perl
# expect a data frame with row and col headings
use strict;
use warnings;
use Data::Dumper;
my $first = 0;
my @colnames;
my %data;
while(<>) {
chomp;
$first++;
if ($first == 1) {
@colnames = split;
next;
}
my @data = split;
my $rowname = shift @data;
for (my $i=0; $i<=$#data; $i++) {
$data{$rowname}->{$colnames[$i]} = $data[$i];
}
}
#print Dumper(\%data);
foreach my $row (keys %data) {
foreach my $col (@colnames) {
print $row, "\t", $col, "\t", $data{$row}->{$col}, "\n";
}
}
exit;
Variant call with samtools/bcftools & variant verification using IGV
- mpileup:
samtools mpileup -uf ref.fa sorted1.bam sorted2.bam ...
- call variants:
bcftools call -mv > var.raw.vcf
OR: bcftools call -c -v --ploidy 1 gbs-50.mpileup > raw.vcf
- extract only SNP sites:
vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all
- filter by quality:
bcftools filter -i '%QUAL>200' gbs50-snps.recode.vcf > gbs50-snps2.vcf
- check by TsTv:
bcftools stats gbs50-snps2.vcf | grep TSTV
- Extract SNPs into Fasta:
- Or, with bcftools:
bcftools query -l gbs50-snps2.vcf > samples
cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' gbs50-snps2.vcf; echo; done > samples.fas
- get ref seq:
echo ">ref" >> sample.fas; grep "^CP" gbs50-snps2.vcf | cut -f4 | paste -s -d >> sample.fas
- Visualization with IGB
- Prepare & load reference genomes
- Load FASTA:
bioseq -i'genbank' ref.gb > ref.fas
- Prepare GFF3 file:
bp_genbank2gff3.pl --CDS --filter exon --filter gene /home/chongdi/michelle/pa_rp73.gb
- Prepare BAM files (see below on how to index and align reads using bwa and samtools, in "ospC Amplicon")
- Index sorted bam files with samtools & load BAM files (multiple bam files okay)
KRAKEN
- assigns taxonomic labels to short DNA reads by examining the k-mers within a read and querying a database with those k-mers:
kraken -db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 --fastq-input 02015P1_S18_L001_R1_001.fastq.gz 02015P1_S18_L001_R2_001.fastq.gz --gzip-compressed --output outfile --paired
- summarize taxonomy:
kraken-translate --db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 outfile > predict.file
- count unique strains:
pat-8]$ cut -f2 s75.predict | cut -f10 -d';' | grep "^[a-zA-Z]" | sort | uniq -c | sort -rn
.
- Pick the strain with the highest counts as the reference genome
PATRIC microbial genome annotation CLI
- Install Debian distribution from github: https://github.com/PATRIC3/PATRIC-distribution/releases/tag/1.015
- Tutorial: http://tutorial.theseed.org/PATRIC-Tutorials/
- Fetch all genomes from a genus:
p3-all-genomes --in genome_name,Borrelia > Borrelia.genomes
; p3-get-genome-data -i Borrelia.genomes -e genome_status,complete > Borrelia-complete.genomes
- Retrieves features from coding sequences only: input file must be a list of genome ids with a header!
p3-get-genome-features -i $file -e feature.feature_type,CDS -a pgfam_id -a patric_id -a plfam_id -a start -a end -a strand -a genome_name -a product -a accession > ${name}-all-features.txt
- Retrieve feature sequence:
p3-echo -t feature.accession "fig|100177.4.peg.1" | p3-get-feature-sequence --dna
- Submit genomes for annotation
- Log in: p3-login user name/password
- Submit a genome for annotation:
submit-patric-annotation --scientific-name "Pseudomonas aeruginosa" --taxonomy-id 287 --genetic-code 11 --domain Bacteria --log pa.log /weigang@patricbrc.org/home/wsdir gid_5.fas
submit-patric-annotation --scientific-name "Streptococcus agalactiae" --taxonomy-id 1311 --genetic-code 11 --domain Bacteria --log sa-1.log /weigang@patricbrc.org/home/wsdir sa-1.fas
- Message: "Uploading gid_5.fas to workspace at /weigang@patricbrc.org/home/wsdir/gid_5/gid_5.fas”
- Create genome groups
- login
cat trep.complete.ids | p3-put-genome-group "Treponema complete”
p3-get-genome-group "Treponema complete”
Parsimony reconstruction using MPR
# MPR for each column
plot.mpr <- function(column=1) {
plot.phylo(tr.unrooted, main = paste(colnames(ph.t)[column]))
tmpr<-MPR(ph.t[,column], tr.unrooted, outgroup = "gid_1311.1320")
nodelabels(paste("[", tmpr[, 1], ",", tmpr[, 2], "]", sep = ""))
tiplabels(ph.t[,column][tr.unrooted$tip.label], adj = -2)
}
# add internal node states
datalist <- data.frame(fam=character(), a=numeric(), b=numeric(), c=numeric())
for (i in 1:ncol(ph.t)) {
#for (i in 1:10) {
tmpr<-MPR(ph.t[,i], tr.unrooted, outgroup = "gid_1311.1320");
out<-c(colnames(ph.t)[i],tmpr[,1])
datalist <- rbind(datalist, data.frame(fam=colnames(ph.t)[i], a=tmpr[1,1], b=tmpr[2,1], c=tmpr[3,1]))
}
#plotting gains and losses
tr.unrooted <- unroot(tr)
gains<-apply(ph.node.states[,9:15], 2, function(x) length(which(x>0)))
losses<-apply(ph.node.states[,9:15], 2, function(x) length(which(x<0)))
plot(tr.unrooted)
edgelabels(gains, adj = c(0.5,-0.25), col=2, frame= "none")
edgelabels(losses, adj = c(0.5,1.25), col=4, frame= "none")
ospC amplicon identification
#Index nucleotide file:
bwa index ref.fa
#align:
bwa mem ref.fa sample_read1.fq sample_read2.fq > sample.sam
#BAM output:
samtools view -b sample.sam > sample.bam
#Sort & index BAM file:
samtools sort -o sample.sorted sample.bam
samtools index sample.sorted
# Calculate coverage by bedtools (better, raw read coverages)
Based on the update: https://bioinformatics.stackexchange.com/questions/2318/how-to-count-reads-in-bam-per-bed-interval-with-bedtools
bedtools bamtobed -i sample.sorted > sample.bed # bam to bed
bedtools coverage -b sample.bed -a ospC.bed > sample.coverage
Alternatives (not good):
bedtools coverage -b sample.sorted.bam -a refs.bed -d > sample.bedtools.cov # per-site
bedtools coverage -b sample.sorted.bam -a refs.bed -counts > sample.bedtools.cov # average
# Generate VCF file (not needed for coverage)
samtools mpileup -uf ref.fa s18.sorted.bam s56.sorted.bam > test.mpileup
bcftools call -c -v test.mpileup > test.raw.vcf
#R plot
# BASH script from Tony Fung & Jamie Cabigas (Spring 2019)
#!/usr/bin/env bash
# ----------------------------------------
# File : fas-to-vcf.sh
# Author : Tony Fung & Jamie Cabigas
# Date : March 21, 2019
# Description : Batch convert FAS files to VCF files
# Requirements : samtools 1.9, bcftools 1.9, bwa 0.7.12
# Input : reference file, format of sequence files, up to 7 sequence id's
# Sample Command : bash fas-to-vcf.sh ref.fasta fastq ERR221622 ERR221661 ERR221662 ERR221663
# Output : up to 7 VCF files
# ----------------------------------------
if [ $# -lt 3 ]
then
echo "Usage: bash $0 <ref_file> <format_of_sequence_files> <sequence_id> [<sequence_id_2> <sequence_id_3> etc.]";
exit;
fi
for arg
do
if [ $arg == $1 ]
then
ref_file="../$1";
elif [ $arg == $2 ]
then
file_format=$2;
else
seq_id=$arg;
fas_file_1="${seq_id}_1.$file_format";
fas_file_2="${seq_id}_2.$file_format";
pwd;
cd $seq_id;
pwd;
# Index nucleotide file:
bwa index $ref_file;
# Align:
bwa mem $ref_file $fas_file_1 $fas_file_2 > $seq_id.sam
# BAM output:
samtools view -b $seq_id.sam > $seq_id.bam
# Sort & index BAM file:
samtools sort $seq_id.bam -o $seq_id.sorted.bam
samtools index $seq_id.sorted.bam
# Generate VCF file:
# -u is deprecated, use bcftools mpileup instead
# samtools mpileup -uf $ref_file $seq_id.sorted.bam > $seq_id.mpileup
bcftools mpileup -f $ref_file $seq_id.sorted.bam > $seq_id.mpileup
bcftools call -c -v $seq_id.mpileup > $seq_id.raw.vcf
cd ..;
pwd;
fi
done
Running a Screen Session
use byobu
- start a screen session by typing "byobu"
- run commands
- Detach by pressing "F6"
- Reattach by typing "byobu"
- Terminate by typing "exit"
use screen
- Start a screen session
screen
- Detach the running mission
ctrl + A + D
- Show the list of running process
screen -ls
- Reattach a running process
screen -r ProcessID
- Terminate a process
ctrl + A
:quit
Bp-utils: sequence, alignment & tree utilities by Qiu Lab
bioseq: sequence/FASTA manipulations
- Use accession "CP002316.1" to retrieve the Genbank file from NCBI. Save the output (in genbank format) to a file named as "cp002316.gb".
bioseq -f "CP002316.1" -o'genbank' > cp002316.gb
- Use the above file as input, extract FASTA sequences for each genes and save the output to a new file called "cp002316.nuc". Use this file for the following questions.
bioseq -i "genbank" -F cp002316.gb > cp002316.fas
- Count the number of sequences.
bioseq -n cp002316.fas
- In a single command, pick the first 10 sequences and find their length
bioseq -p "order:1-10" cp002316.fas | bioseq –l
- In a single command, pick the third and seventh sequences from the file and do the 3-frame translation. Which reading frame is the correct on both? Specify
bioseq -p "order:3,7" cp002316.fas | bioseq -t3
- Find the base composition of the last two sequences
bioseq -p "order:25-26" cp002316.fas| bioseq –c
- Pick the sequence with id "Bbu|D1_B11|8784|9302|1" and count the number of codons present in this sequence
bioseq -p "id:BbuJD1_B11|8784|9302|1" cp002316.fas | bioseq –C
- Delete the last 10 sequences from the file and save the output to cp002316-v2.nuc
bioseq -d "order:17-26" cp002316.fas > cp002316-v2.nuc
- In a single command, pick the first sequence, then get the 50-110 nucleotides and make reverse complement of the sub-sequences
bioseq -p "order:1" cp002316.fas | bioseq -s "50,110" | bioseq –r
- In a single command, get the first 100 nucleotides of all the sequences present in the file and do 1-frame translation of all sub-sequences.
bioseq -s "1,100" cp002316.fas | bioseq -t1
bioaln: alignment/CLUSTALW manipulations
- Go to /home/shared/LabMeetingReadings/Test-Data and find the sequence alignment file “bioaln_tutorial.aln”. Name the format of the alignment file. Use it to answer all the questions below.
CLUSTALW
- Find the length of the alignment.
bioaln -l bioaln_tutorial.aln
- Count the number of the sequences present in the alignment.
bioaln -n bioaln_tutorial.aln
- How do you convert this alignment in phylip format? Save the output.
bioaln -o "phylip" bioaln_tutorial.aln > test.phy
- Pick “seq2, seq5, seq7, seq10” from the alignment and calculate their average percent identity.
bioaln -p "seq2, seq5, seq7, seq10" bioaln_tutorial.aln | bioaln -a
- Get an alignment slice from “50-140” and find the average identities of the slice for sliding windows of 25.
bioaln -s "50, 140" bioaln_tutorial.aln | bioaln -w "25"
- Extract conserved blocks from the alignment.
bioaln -B bioaln_tutorial.aln
- Find the unique sequences and list their ids.
bioaln -u bioaln_tutorial.aln | bioaln -L
- Extract third sites from the alignment and show only variable sites in match view.
bioaln -T bioaln_tutorial.aln | bioaln -v | bioaln -m
- Remove the gaps and show the final alignment in codon view for an alignment slice “1-100”.
bioaln -s "1, 100" bioaln_tutorial.aln | bioaln -g | bioaln -c
- Add a 90% consensus sequence and then show the final alignment in match plus codon view for an alignment slice “20-80”. (Hint: match view followed by codon view)
bioaln -s "20, 80" bioaln_tutorial.aln | bioaln -C "90" | bioaln -m | bioaln -c
biotree: tree/NEWICK manipulations
biopop: SNP statistics
Homology searching and clustering
BLAST+: search("google") for homologs/pariwise alignment
hmmer
- link: http://hmmer.org/
- Installation:
% conda install -c biocore hmmer
# Anaconda
- Build profile:
hmmbuild globins4.hmm globins4.sto
- Search:
hmmsearch globins4.hmm uniprot_sprot.fasta > globins4.out
cdhit
cdhit -i all.pep -o all.cdhit -c 0.5 -n 3
Options:
- -i: input file
- -o: output file
- -c: percent identity (below which it is considered different families)
- -n: word length
interproscan
../../software/interproscan/interproscan-5.13-52.0/interproscan.sh -i trep-cdhit.representatives.pep2 -o trep-representatives.tsv -t p -goterms -pa -f tsv
Documentation page: How to run
Programs for producing multiple alignments
MUSCLE
MUGSY
- MUGSY bash
#!/bin/sh
export MUGSY_INSTALL=/home/weigang/mugsy_x86-64-v1r2.2
export PATH=$PATH:$MUGSY_INSTALL:$MUGSY_INSTALL/mapping
export PERL5LIB=$MUGSY_INSTALL/perllibs
#For testing TBA
#export PATH=$PATH:$MUGSY_INSTALL/../../multiz-tba/trunk/
- source the bash file
source mugsyenv.sh
- run mugsy
mugsy --directory /home/chongdi/Streptococcus/mugsy-output -prefix mugsy_aln mugsy-input/*.fa
CLUSTALW
MAFT
TCOFFEE
Programs for producing phylogeny & phylogenetic analysis
IQ-Tree
- Beginner's tutorial
- Advanced tutorial
- Model search (slow!! better done with byobu)
iqtree -s example.phy -m MFP
- protein tree with fast bootstrap:
iqtree -s example.phy -m WAG+I+G -bb 1000
(version 1.xx; -B for latest version)
- site-specific rates:
iqtree -s example.phy -rate
iqtree -s example.phy -wsr
(version 1.xx; -t and -m from above to save time)
- ancestral state reconstruction:
iqtree -s example.phy -asr
(-t and -m from above to save time)
FastTree
PHYLIP
MrBayes
RaXML
- Required arguments
- -s alignment (in PHYLIP or FASTA)
- -n tag
- Simple run (ML):
raxmlHPC-SSE3 -x 12345 -p 12345 -# autoMRE -s concat.fas -m GTRGAMMA -n tag -q part.txt
- Bootstrap:
raxmlHPC-SSE3 -f a -x 12345 -p 12345 -# 100 -s concat.fas -m GTRCAT -n tag -q part.txt
- Make a file named "part.txt" with the following lines (Chanlge the number to the total length of your alignment):
DNA, gene1codon1 = 1-3765906\3
DNA, gene1codon2 = 2-3765906\3
DNA, gene1codon3 = 3-3765906\3
- The resulting files
- RAxML_bestTree.tag: best tree (no bootstrap)
- RAxML_bipartitionsBranchLabels.tag: ignore
- RAxML_bipartitions.tag: main result. Feed this tree to figtree
- RAxML_bootstrap.tag: ignore
- RAxML_info.tag : log file
- Protein models
raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAS
# protein, gamma, Whelan & Goldman (2001) model
raxmlHPC-SSE3 -s protein.phy -n A2 -m PROTGAMMAGTR
# protein, gamma, user model
raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123
# protein, gamma, Whelan & Goldman (2001) model, bootstrap
raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -o Carp,Loach
# protein, gamma, Whelan & Goldman (2001) model, root on a (monophyletic group)
raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAS -# autoFC -p 0123
# protein, gamma, Whelan & Goldman (2001) model, bootstrap (at least 99 splits, auto-stopping)
raxmlHPC-SSE3 -f a -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 -x 0123
# Rapid bootstrap with consensus
- output 1, RAXML_bestTree.A1 (ml tree)
- output 2, RAXMLbootstrap.A1 (bootstrap relicates)
- output 3, RAXMLbipartitions.A1 (tree with boot strap values)
PhyloNet
R packages for phylogenetics
APE
Convert a tree to ultrametric using time estimates
t <- read.tree("core-tree-30otus.dnd")
t.ultra <- chronopl(t, 0)
t.hclust <- as.hclust(t.ultra)
t.dendro <- as.dendrogram(t.hclust)
heatmap(ge.var, Rowv=t.dendro) # order the rows with customized tree
phengorn
phytools
Population genetics
LDHat: test of recombination based on 4-gametes
- filter sites:
convert -seq snps.sites -loc snps.locs -freqcut 0.08
- Pairwise LD tests: First generate "lookup" table:
lkgen -lk /usr/local/LDhat/lk_files/lk_n50_t0.001 -nseq 48
; then calculate pairwise LD statistics: pairwise -seq sites.txt -loc locs.txt -lk new_lk.txt
- Run interval for recombination hot spots:
interval -seq sites.txt -loc locs.txt -lk new_lk.txt -its 1000000 -bgen 5 -exact
- Get stats:
stat -input rates.txt -loc locs.txt -burn 50 (burnin=100K)
- Plot in R:
x = read.table("rates.txt", skip=1, fill=T);
x = as.matrix(x);
burn.in = 50;
low = as.integer(nrow(x)*burn.in/100);
means<-apply(x[low:nrow(x),], 2, mean, na.rm=T);
q.95<-apply(x[low:nrow(x),], 2, quantile, probs=c(0.025, 0.5, 0.975), na.rm=T);
pos<-as.vector(as.matrix(read.table("locs.txt", as.is=T, skip=1)));
plot(pos[1:(length(pos)-1)], y=means[2:length(means)], type="s", col=rgb(0,0,0.5),
xlab="SNP position (B111)", ylab="Posterior mean recombination rate", main="GBS Strep: recombination by LDhat");
lines(pos[1:(length(pos)-1)], y=q.95[1,2:length(means)], type="s", col=grey(0.75), lty="dotted");
lines(pos[1:(length(pos)-1)], y=q.95[3,2:length(means)], type="s", col=grey(0.75), lty="dotted");
ms: coalescence simulation
SFS: forward simulation
PAML: testing selection with Ka/Ks
Microbial genome databases & pipelines in Qiu Lab
borreliabase
pa2
spiro_genomes/treponema
Blast a set of genes against a bacteria genome
- download genome
- extract gene sequences & translate
- Make blast database
- Run blastp
de novo variant call with cortex_var
Create binary file of fasta genome file.
Run contex_var_31_c1 (cutoff 1 used for 1 genome)
- --se_list is the command the reads the list you want to target (ie: list-genome.txt)
- --kmer_size is the middle size, has to be an odd integer
- --mem_width always choose 17
- --mem_height always choose 100
- --dump_binary Name your file name (ie: Genome.ctx)
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--se_list list-Evo.txt
--kmer_size 31
--mem_width 17
--mem_height 100
--dump_binary Evo.ctx
> Evo.log save log file
Read each binary file (.ctx) into its own individual color list (ls Evo.ctx > Evo.colorlist)
Then save these lists into their own collective colorlist.txt (ls *.ctx > colorlist.txt)
Reveal genetic variation using the Bubble Caller from cortex_var.
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5
--se_list colorlist.txt
--kmer_size 31
--mem_width 17
--mem_height 100
--dump_binary all-colors.ctx
> all-colors.log save log file
Bubble caller will detect differences between each genome by assigning distinct colors to each genome (note that the UK spelling of color is used: colour)
- --multicolour_bin holds your all-colors.ctx binary from the Bubble Caller
- --detect_bubbles1 i/i Detects 1 variation between genomes i and i. i indicates the position number the genome is listed on the colorlist.txt file. If the genome is fourth on the colorlist.txt, for example, its corresponding i variable is 4
- --output_bubbles1 Output variant reads in fasta format (ie: Evo-RefHG.var for bubble detection between
Evolved genome and Reference HG genome)
- --print_colour_coverages necessary for output
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5
--kmer_size 31
--mem_height 17
--mem_width 100
--multicolour_bin all-colors.ctx
--detect_bubbles1 0/1
--output_bubbles1 Evo-RefHG.var
--print_colour_coverages
> Evo-RefHG.log save log file
Variant call with cortex_var
Example
- Files
MS00018_S5_L001_R1_001.fastq
MS00018_S5_L001_R2_001.fastq
KU-090401_S3_L001_R1_001.fastq
KU-090401_S3_L001_R2_001.fastq
...
- File content
@M03268:52:000000000-AJFAY:1:1101:16970:1555 1:N:0:7
CCCATGAACGGCACGTTCACGATGCAGAAGGTGGTGACCAGGCCGGTGTCGGCGACCTCGACGTAATCGTCGGTGGGGGA
GCCGTCGCGCGGGTCCGCGCCGCGCGGCGGGAAGTACACCTTGCCGTCCATGCGGGCGCGGGCGCCCAGCAGACGACCCT
CCATGAGCGCATTCAGGAATTCGGCCTCCTGCGGCGCGGCGGTGTGCTTGATATTGAAGTCGACCGGGGTCACGATGCCG
GTGACCGGTT
+
11>1>111@11>1A1FGF1FC0F0A111010GB0ECBFGF0/AFECGGFHECE??EEGHE/EEEHEFHEHHGCEECE??/
<CFCGGCCGCCCCCCHCGGCCCG@G??@@?@-@BFFFFFFFFFFF;B@FBFBFF<?@;@@-=?@??@-EFFFFF@;@@@F
FFBF/BBF;@@FFFFFFFFFFFFF@FFFFFFF<@@@@@@@@@@@FBFFFFFFFFFFFFFFFFF@@@?@?@FFFFFFFBF@
?@@EFF@@<B
@M03268:52:000000000-AJFAY:1:1101:16136:1618 1:N:0:7
TCCTGGCCCGTGAAACCGCTTGCCCGGTACAGGTTCTGGACTACCGCCTGGCACCCGAGCATCCGTTCCCGGCGGCGCTC
GACGACGCCGAGGCGGCGTTCCTGGAACTGGTGGCCGCCGGCTACCGGCCCGAACGCATTGCGGTCGCGGGTGATTCGGCCGGTGGCGGGCTCTCGCTCGCGCTGGCCGAACGGCTGCGCGACCGGCACGGGCTGGTTCCGGCCGCGCTCGGGCTGATCGCGCCCTGGGC
+
11>>11C1A@A?11BDF?EEGAFGFG?ECCHBHHGFG1EGHFHHHCCGGC/GCGGGCEECEFGHGGFFGHGGGGCECCC<CCC@CCCC@CGCCCGGC?:@EBFBF/CFFFF0/CFG?=@=-9>AFF@=@@?@;@@@F--9:BF@A@@?@--;9-BFFFF@A-@99B?-9@?=EFFBAAE;9>?;@@BF@9-@-;@=-E@@@=@@;@?>@9@<?@?BBBFFF;?>;;-@@?@<?@?9-FBFF@-99-9E-9
...
Step 1
- Create matched FASTQ files (python script)
#!/usr/bin/python
from sys import argv
script, File1, File2 = argv
# Create a dictionary listing the sequences in the first file for reference
file1 = open(File1)
dict1 = {}
for line in file1:
if '@M03268' in line:
tag1 = line.rstrip()[:-9]
tail = line.rstrip()[-9:]
dict1[tag1] = []
else:
dict1[tag1].append(line.rstrip())
file1.close()
# Create two output files
f1 = open(File1.replace('.fastq', '_mat.fq'), 'w')
f2 = open(File2.replace('.fastq', '_mat.fq'), 'w')
# Match the sequence
file2 = open(File2)
for line in file2:
if dict1.has_key(line.rstrip()[:-9]): # The has_key method
tag1 = line.rstrip()[:-9]
f1.write(tag1 + tail + '\n')
for j in range(3):
f1.write(dict1[tag1][j] + '\n')
del dict1[tag1]
dict2 = {} # Create a temporary dictionary for sequence in the file2
tag2 = line.rstrip()
dict2[tag2] = []
else:
dict2[tag2].append(line.rstrip())
if len(dict2[tag2]) == 3:
f2.write(tag2 + '\n')
for j in range(3):
f2.write(dict2[tag2][j] + '\n')
file2.close()
f1.close()
f2.close()
Step 2. Clean Fastq Files & Run Single-color Graph & Error Cleaning
- Create a file list showing all outcome files whose extensions need to be changed from _mat.fq to .list
bbduk.sh -Xmx1g in1=fastq_file1 in2=fastq_file2 -out1=clean1.fq -out2=clean2.fq qtrim=rl trimq=20
- Create a file list showing all outcome files whose extensions need to be changed from _mat.fq to .list
for f in *_mat.fq;
do
title=$(echo $f | cut -d'_' -f2);
id=$(echo $f | cut -d'_' -f1);
echo $f > ${id}.list${title};
done
- Single color graph for sample
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--pe_list MS00018_S5.list1,MS00018_S5.list2
--kmer_size 31
--mem_height 17
--mem_width 100
--dump_binary MS00018_S5.ctx
--sample_id MS00018_S5
--remove_pcr_duplicates
--quality_score_threshold 20 > MS00018_S5.log
- Single color graph for reference
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--se_list ref.filelist
--kmer_size 31
--mem_height 17
--mem_width 100
--dump_binary ref.ctx
--sample_id ref 20 > ref.log
- Run Error Cleaning for All Samples (reference is not included)
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1
--mem_height 18
--mem_width 100
--kmer_size 31
--multicolour_bin N18_S15.ctx
--remove_low_coverage_supernodes 10
--dump_binary N18_S15.cleaned.ctx
Step 3
- Pull the name of each .cleaned.ctx file to a cleaned.list file, then create a .filelist file for all cleaned.list files.
ls file1.cleaned.ctx > file1.cleaned.list
ls file2.cleaned.ctx > file2.cleaned.list
ls ref.ctx > ref.list
ls -1 *.list > ref-sample.filelist
- Multicolour Graph
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3
--mem_height 20
--mem_width 100
--kmer_size 31
--colour_list ref-sample.filelist
--dump_binary ref-sample.ctx > ref-sample.log
Step 4
- Variation Discovery Using The Bubble Caller
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3
--mem_height 20
--mem_width 100
--kmer_size 31
--multicolour_bin ref-sample.ctx
--detect_bubbles1 -1/-1
--ref_colour 2
--output_bubbles1 bubbles-in-sample.out
--print_colour_coverages
--experiment_type EachColourAHaploidSampleExceptTheRefColour
--genome_size 8000000 > bubbles-in-sample.log
Step 5
- Reference Genome
(When in Cluster execute "module load stampy": doesn't work; path problem)
(Run the following on wallace:)
stampy.py -G ref ref.fa
stampy.py -g ref -H ref
- Turn Into VCF with reference
Make a sample list file (from bubble or multicolor log file):
cat e1-bubbles-in-sample.log | grep CLEANED | cut -f2 > e1.sample.lis
Customize the following command based on your output files, num of colors, index of ref colors, etc
perl /home/weigang/CORTEX_release_v1.0.5.21/scripts/analyse_variants/process_calls-wq.pl --callfile e1-bubbles-in-sample.out --callfile_log e1-bubbles-in-sample.log -outvcf e1-bubbles-in-sample --outdir e1-vcfout --samplename_list e1.sample.list --num_cols 7 --stampy_bin /home/weigang/stampy-1.0.28/stampy.py --stampy_hash ref --refcol 6 --vcftools_dir /usr/local/bin --caller BC --kmer 31 --ploidy 1
Step 6. Parse VCF files
- Filter out low-quality sites:
vcftools --vcf pat-5.decomp.vcf --keep-filtered PASS --recode --out pat-5
- Extract coverage
vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info COV
(output file: out.COV.FORMAT)
- Extract genotypes
vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT
(output file: out.GT.FORMAT)
- Extract confidence
vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT_CONF
(output file: out.GT_CONF.FORMAT)
- Send GT_CONF file to R and visualize log10(conf) distribution with boxplot
- Use custom PERL file to filter out low-quality (e.g., GT_CONF < 30) genotype calls (flag with "?" or "na"), and make haplotype GT ("1/1" to "1", "0/0" to "0", "./." to "?)
- Verify variants using IGV (see IGV protocol above)
Step 7. Variant Annotation & Visualization
vcf_parser.py out.decomp.vcf ref.gb
(the code needs validation)
- Data to database & web visualization, if necessary
hmmer
Annotate proteins with TIGRFAM
hmmsearch --tblout foo.hmmout # table output for all sequences
--domtblout foo.dmout # table output for all domains
-E 0.01 # level of sequence significance
--domE 0.01 # level of domain significance
-o /dev/null # don't show STDOUT
../../TIGRFAMs-Release-15-Sep-17-2014/TIGRFAMs_15.0_HMM.LIB # HMM profile library for tiger fams
GCA_000583715.2_ASM58371v2_protein.faa & # input/query file in FASTA
PopGenome
library(PopGenome)
g = readVCF("pvt1.recode.vcf.gz", 1000, "8", 127890000, 128102000)
pops = split(sample[,1], sample[,2]) # create a list of populations
g = set.populations(g, pops, diploid = T) # set population names
# by windows
slide = sliding.window.transform(g, width = 100, jump = 20) # nsnps, not actual length
slide = F_ST.stats(slide, mode = "nucleotide")
snp.pos = slide@region.data@biallelic.sites # SNP positions
win.num = length(slide@region.names)
win.start = numeric()
for (i in 1:win.num) {win.start[i] = snp.pos[[i]][1]}
fst = slide@nuc.F_ST.vs.all
pop.names = names(slide@populations) # population names
plot(win.start, fst[,1], type ="n", las = 1, ylab = expression(F[st]), xlab = "SNP Position", ylim = c(0,0.4))
for (i in 1:length(slide@populations)) {
lines(win.start, fst[,i], type = "l", col = pop.group[pop.names[i],4])
}
arch.coords=c(127982050, 127992931)
abline(v = arch.coords, col = "orange")
#rect(xleft = arch.coords[1], ybottom = -1, xright = arch.coords[2], ytop = 0.5, border = "transparent", col = 2)
Velvet
Cleaning
../../bbmap/bbduk.sh -Xmx1g
in1=WGC067462_hunhewenku_509_combined_R1.fastq # gz file works as well
in2=WGC067462_hunhewenku_509_combined_R2.fastq # gz file works as well
-out1=clean1.fq
-out2=clean2.fq
qtrim=rl
trimq=20
Running Velvet Optimizer
srun ../../VelvetOptimiser-2.2.5/VelvetOptimiser.pl
--t 32
--s 31 --e 31 --x 6 # kmer sizes
-f '-shortPaired -fastq clean1.fq -shortPaired2 -fastq clean2.fq'
-t 4
--optFuncKmer ‘n50’
-p prefix
PacBio assembly with canu
./canu -p staph-auto-5 -d staph-auto-5 genomeSize=2.2m -pacbio-raw pac-reads-5.tar.gz