Mini-Tutorials
=Weblogo=
* Install weblogo from [https://github.com/WebLogo/weblogo github]
* Or install with conda: <code>conda install -c conda-forge weblogo</code>
* Or install with pip: <code>pip install weblogo</code>
* Run the following command:
<syntaxhighlight lang="bash">
weblogo -f IR4-pep.fas -o ir4-logo.pdf -D fasta -F pdf -A protein -s large -t "IR4" -c chemistry
</syntaxhighlight>
=Tree visualization with ggtree=
<syntaxhighlight lang="r">
setwd("C:/Users/Weigang/Dropbox/Natasha-files/")
library(ggtree)
library(treeio)
library(tidyverse)
 
# read tree
tree <- read.tree("asian_clade3_v2.nwk")
 
# create a tibble of OTU groups
tips <- tibble(id = tree$tip.label,
              origin = c(rep("Eurasia",4), "US", "Eurasia",
                        "US", rep("Eurasia",25)))
 
# plot tree
p <- ggtree(tree) +
  xlim(0,0.02) + # to avoid overflow of labels
  geom_treescale(x=0, y=30) 
# join tree and add group color
p %<+% tips +
  geom_tiplab(aes(color = origin)) +
  scale_color_manual(values = c("Eurasia" = 1,  "US" = 2)) +
  theme(legend.position = "none")
</syntaxhighlight>
 
=Github for sharing data and source code=
* SSH setup (github no longer allows password-based push to remote repository; a command sketch follows this list)
# [https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent Create a new SSH key on your computer]
# [https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account Add the key to your Github account]
# [https://gist.github.com/xirixiz/b6b0c6f4917ce17a90e00f9b60566278 Commit with SSH]
## Authentication/test: <code>ssh -T git@github.com</code>
## Add repository for commit: <code>git remote set-url origin git@github.com:bioperl/p5-bpwrapper.git</code>
* Developer workflow
** Clone remote repository: <code>git clone <rep-name.git></code>
** Sync local to remote repository: <code>git pull</code>
** Check local repository status: <code>git status</code>
** Show the latest commit: <code>git log</code>
** Add new file: <code>git add <filename> </code>
** Commit changes to local repository: <code>git commit -a -m "message"</code>
** Push to remote repository: <code>git push</code>
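A minimal end-to-end sketch of the SSH setup above (key type, file name, and email are illustrative placeholders):
<syntaxhighlight lang="bash">
# 1. generate a key (accept the default path ~/.ssh/id_ed25519)
ssh-keygen -t ed25519 -C "you@example.com"
# 2. start the agent and add the key
eval "$(ssh-agent -s)"
ssh-add ~/.ssh/id_ed25519
# 3. paste ~/.ssh/id_ed25519.pub into Github -> Settings -> SSH and GPG keys, then test:
ssh -T git@github.com
# 4. point an existing clone at the SSH remote
git remote set-url origin git@github.com:bioperl/p5-bpwrapper.git
</syntaxhighlight>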
 
=bcftools pipeline=
<pre>
bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf
vcftools --vcf snps4.vcf --maf 0.01 --recode --recode-INFO-all --out maf    # n=33 SNPs
bcftools query -l maf.recode.vcf > samples                                  # n=17775 samples
cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' maf.recode.vcf; echo; done > samples-maf.fas
</pre>
 
=R tips=
==Add expected normal curve to histogram==
 
<syntaxhighlight lang="r">
plot.trait <- function(qt.out) {
  phe <- qt.out[[4]]$phe
  exp.mean <- qt.out[['trait.exp']][1]
  exp.sd <- sqrt(qt.out[['trait.exp']][2])
  x <- seq(min(phe), max(phe), length = 100)
  fun <- dnorm(x, mean = exp.mean, sd = exp.sd)
  hist(phe, probability = T, ylim = c(0, max(fun)), br = 50, las = 1)
  lines(x, fun, col = 2, lwd = 2)
}
</syntaxhighlight>
 
==Run batch contingency table tests==
<syntaxhighlight lang="r">
 
snp_ct <- snp_long %>%  group_by(POS, geno, virulence) %>%  count()
 
# get pos with < 4 entries:
bad_pos <- snp_ct %>% group_by(POS) %>% count() %>% filter(n < 4)
 
library(broom)
 
# batch fisher's exact test
snp_fisher <- snp_long %>%
    filter(!POS %in% bad_pos$POS) %>% 
    group_by(POS) %>% 
    do(tidy(xtabs(~geno + virulence, data = .) %>%
        fisher.test(simulate.p.value = T)))
 
snp_fisher <- snp_fisher %>% mutate(y = -log10(p.value))
 
# plot volcano
snp_fisher %>%
    ggplot(aes(x = estimate, y = y)) + 
    geom_point(shape = 1, alpha = 0.5) + 
    scale_x_log10() +
    geom_vline(xintercept = 1, linetype = 2, color = "red") + 
    theme_bw() + 
    xlab("odds ratio (log10)") + 
    ylab("signficance (-log10[p])")
 
</syntaxhighlight>
 
 
==multiple regression models (with broom::tidy)==
<syntaxhighlight lang="r">
library(broom)
compensation.models <- compensation %>% group_by(Grazing) %>% do(tidy(lm(Fruit ~ Root, data = .))) %>% filter(term != '(Intercept)')
</syntaxhighlight>
 
==multiple regression models (with nested tibble)==
<syntaxhighlight lang="r">
# build multiple models, one for each serum:
by_serum <- two_od %>% group_by(Serum) %>% nest() # nested data frame, one row per serum
x_model <- function(df) { # model function (similar to lapply)
  x <- df %>% filter(OspC!='A02' & OspC!='A04')
#  x <- df %>% filter(OspC!='A02' & OspC!='A04' & OspC!='N14')
  lm(OD1 ~ OD2, data = x)
}
 
by_serum <- by_serum %>% mutate(model = map(data, x_model)) # add model to each serum
 
output <- vector("list")
for(i in 1:length(by_serum$Serum)) {
  df <- by_serum$data[[i]]
  model <- by_serum$model[[i]]
  pd <- predict.lm(model, newdata = data.frame(OD2 = df$OD2))
  output[[i]] <- data.frame(OspC = df$OspC,
                            OD2 = df$OD2,
                            OD1 = df$OD1,
                            OD1_pred = pd,
                            Serum = rep(by_serum$Serum[i], nrow(df))
                                        )
}
df.out <- bind_rows(output)
</syntaxhighlight>
 
==multiple regression models (with custom functions)==
<syntaxhighlight lang="r">
# build model
models <- xy %>% split(.$Serum) %>% map(~lm( aff_PL ~ aff_C3H, data= .))
 
# get r-squared:
models %>% map(summary) %>% map_dbl("adj.r.squared")
 
# get p values (of slope, using a custom function):
get.pvalue <- function(x){
  s <- summary(x);
  s$coefficients[2, 4]
}
models %>% map(get.pvalue) %>% unlist()
 
# get slope:
get.slope <- function(x){
  s <- summary(x);
  s$coefficients[2, 1]
}
models %>% map(get.slope) %>% unlist()
 
# get intercept:
get.intercept <- function(x){
  s <- summary(x);
  s$coefficients[1, 1]
}
models %>% map(get.intercept) %>% unlist()
</syntaxhighlight>
 
=Microbial genome alignment with MUMMER4 and other tools=
==SARS-CoV-2 GISAID pipeline==
<pre>
#######################################
# Parse GISAID sequences fasta
######################################
1. bioseq -B cov.fas # burst into individual files
1a. move the un-bursted file out of the "human-files" directory!
mv cov-human.fas ../
 
2. sam align (weigang@wallace:~/cov-03-09-2030/host-human$ for f in *.fas; do ../sam-align.bash $f; done )
    nucmer --sam-long=COH1 B111.fa COH1.fa
    samtools view -b COH1.sam -T B111.fa > COH1.bam
    samtools sort COH1.bam -o COH1.sorted.bam
    samtools index COH1.sorted.bam
 
    with script: for f in *.fas; do ../sam-align.bash $f; done
 
3c. Less strict call: bcftools mpileup -Ou -f ../ref.fas  *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt  -Ob -o calls.bcf -P 0.05 (or -P 0.1; large P value for less strict call, default 1.1e-3)
# 3b. bcftools mpileup -Ou -f ../ref.fas  *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt  -Ob -o calls.bcf, with default ploidy as 1:
weigang@wallace:~/cov-03-09-2020/cov57$ cat ploidy.txt
*      *      *      M      1
 
6a. bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf ( get only biallelic SNPs)
6b. vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all --out cov.vcf # filter SNPs only
 
7. filter sites by allele counts: only informative sites
bcftools view snps.bcf > snps.vcf
vcftools --vcf snps.vcf --mac 2 --recode --recode-INFO-all --out snps2.vcf
</pre>
 
==Variant-calling pipeline (fastq to VCF to tree)==
<pre>
######################
Host & Environment: wqiu@bioit.hunter.cuny.edu
vcftools & clonalframe "source activate gbs"
bcftools: "source activate ngs"
 
1. align fastq to ref
1) index reference genome
 
bwa index B111.fa
=> 5 files: amb, ann, bwt, pac, sa
 
2) align reads to ref
ls -1 *.gz | cut -d'.' -f1 | sort -u > sample.list
 
byobu
 
cat sample.list | while read line; do bwa mem B111.fa ${line}.R1.fastq.gz ${line}.R2.fastq.gz > $line.sam; done
=> sam file
 
 
2. convert to bam, sort by ref coordinates
 
cat sample.list | while read line; do samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam; done
=> acc.bam
 
 
1+2: wrapped to save space
cat sample.list | while read line; do
begin=$(date +"%D:%H:%M")
echo -ne "$line ... started $begin ...";
bwa mem B111.fa ${line}.clean.1.fastq.gz ${line}.clean.2.fastq.gz > $line.sam 2> /dev/null;
bwa_time=$(date +"%D:%H:%M")
echo -ne "sam file generated: $line.sam on $bwa_time ...";
samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam 2> /dev/null;
bam_time=$(date +"%D:%H:%M")
echo -ne "bam file generated: $line.bam on $bam_time ...";
rm $line.sam;
echo "sam file deleted. Done";
done
 
 
 
3. index
cat sample.list | while read line; do samtools index ${line}.bam; done
=> acc.bam.bai
 
 
 
4. Variant Call
 
1) make ploidy.txt file
cat > ploidy.txt
*      *      *      M      1
 
2) pileup and make bcf file
bcftools mpileup -Ou -f B111.fa *.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf -P 0.05
((or -P 0.1; large P value for less strict call, default 1.1e-3))
=> calls.bcf
 
 
 
5. analyse calls.bcf; remove outlier SNPs and samples
1) check stats
bcftools stats calls.bcf > call.stats
ts/tv (transition/transversion): better if greater than 3
check AF freq, remove the first class if necessary (e.g., vcf --maf 0.008)
 
check counts:
vcftools --vcf snps.vcf --counts --out snps
 
Filter by maf counts (at least 2):
vcftools --vcf snps-255.recode.vcf --recode --recode-INFO-all --out snps2 --mac 2
 
 
2) get only biallelic SNPs
bcftools view -m2 -M2 --types snps calls-87.bcf > snps-87.vcf
bcftools stats snps.vcf | ll
 
bcftools view -m2 -M2 --types snps call-49.vcf > snps-49.vcf
 
3) merge samples
bgzip snps-87.vcf
bgzip snps-49.vcf
=> vcf.gz file
 
bcftools index snps-87.vcf.gz
bcftools index snps-49.vcf.gz
=> vcf.gz.csi file
 
bcftools merge -Ob snps-87.vcf.gz snps-49.vcf.gz > snps-136.bcf
 
get only biallelic SNPs
bcftools view -m2 -M2 --types snps snps-136.bcf > snps-136.vcf
ln -s snps-136.vcf snps.vcf
 
# use vcftools to remove invariant sites, in case samples are dropped:
vcftools --gzvcf snps2.vcf.gz --non-ref-ac-any 1 --recode --recode-INFO-all --out tmp --remove-indv IND
 
6. vcf to fasta
1) modify sample name (should use "bcftools reheader -s change-id.txt -o tmp.vcf snps.vcf")
cat snps.vcf | sed 's/.bam//g; s/.sorted//g; s/batch-..//g' > tmp.vcf
mv tmp.vcf snps-136.vcf
 
2) sample list
bcftools query -l snps.vcf > sample.txt
 
3) make fasta: could be gzipped
cat sample.txt | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' snps.vcf; echo; done > sample.fas
Note: "." in the fas file means missing nt, handled correctly by iqtree
 
4) get ref seq
echo ">B31" > ref.fas
ll snps.vcf => get ref id
grep "^Chromosome" snps.vcf | cut -f4 | paste -s -d '' - >> ref.fas
#grep "^CP021772" snps.vcf | cut -f4 | paste -s -d '\0' - >> ref.fas (#if on apple system)
 
cat ref.fas >> sample.fas
=> 137 samples, 46783 snps
 
go to genometracker:
scp sample.fas snps-136.vcf weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/.
 
on genometracker:
ln -s snps-136.vcf snps.vcf
 
# subset samples (e.g., bbss only):
bcftools view -Ov -S samples-bbss.txt --force-samples snp5-lp17.vcf > snp5-lp17-bbss.vcf
 
 
7. make tree (with iqtree, for SNP-only alignment, with bootstrap; can't have constant sites)
iqtree2 -s sample.fas -m GTR+ASC -B 1000 -nt AUTO (-n AUTO to use all CPUs on the cluster)
 
# if containing full seq alignment with constant sites, e.g., HIV env gene alignment
# iqtree -s $work_dir/test.fas -m GTR+F+G -B 1000 --redo -nt AUTO
=> sample.fas.contree
 
 
8. consequence call
1) download ref in genbank format
NCBI nucleotide database: search CP021772
=> B111.gb
 
2) convert genbank to gff3 format
(1) py file: modified from https://dmnfarrell.github.io/bioinformatics/bcftools-csq-gff-format
=> gb2gff3.py
(2) ./gb2gff3.py B111.gb
=> B111.gff3
(3) change acc to match ref name in snps.vcf
cat B111.gff3 | sed 's/^CP021772.1/CP021772/' > tmp.gff3
mv tmp.gff3 B111.gff3
 
3) call consequence (to tsv or bcf files)
bcftools csq -f B111.fa -g B111.gff3 snps.vcf -Ot -o snps-csq.tsv
 
cut -f2,5,6 snps-csq.tsv | tr '|' '\t' | cut -f1-5,7- > snps-csq2.tsv
 
9. web development
1) orf info from genbank
perl get-orf-by-acc.pl CP021772 > orf.csv
 
2)
scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/sample.fas.contree .
#biotree -m sample.fas.contree | biotree --ref 'B111' > sample.dnd
#biotree -r 'GBS02' sample.fas.contree | biotree --ref 'B111' > sample.dnd
#biotree -E 0.005 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > sample.dnd
biotree -D 90 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > tree.dnd
 
tree visualization by R or PhyloView or R/ggtree
 
3)
scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/snps-csq2.tsv .
 
4) pheno.csv
 
</pre>
 
==ClonalFrameML pipeline (WGS)==
# Reconstitute whole-genome alignment from vcf; following this post: https://gatk.broadinstitute.org/hc/en-us/community/posts/360070002671-converting-multisample-vcf-to-fasta. Algorithm: split vcf into single-sample vcf's; use "bcftools consensus" to obtain alternate ref seqs. Environment: conda activate ngs (on bioit)
## create one-sample vcf's
cat ../samples-bbss.txt | while read line; do bcftools view -Ov -s $line ../bbss-cp26.vcf.gz > snp5-cp26-$line.vcf; done
## bgzip & tabix
for f in *.vcf; do bgzip $f; tabix $f.gz; done (or bcftools index)
for f in snp5-lp54-*.vcf; do bgzip $f; bcftools index $f.gz; done
## Get consensus
From doc: "Create consensus sequence by applying VCF variants to a reference fasta file. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence. Using the --sample (and, optionally, --haplotype) option will apply genotype (haplotype) calls from FORMAT/GT."
tail -27 ../samples-bbss.txt | while read line; do cat ../../B31-files/cp26_plasmid.fasta | bcftools consensus --sample $line snp5-cp26-$line.vcf.gz | sed "s/>cp26/>$line/" > bbss-cp26-$line.fa; done
## Add reference:
cat ../../B31-files/cp26_plasmid.fasta | sed "s/>cp26/>B31/" > bbss-cp26-B31.fa
 
concatenate & check length:
cat *.fa > wgs-cp26.fasta
bioseq -l wgs-cp26.fasta
 
check with bioaln for variability
conda activate gbs
bioaln -i 'fasta' -m wgs-cp26.fasta | less
 
# run ClonalFrame (conda env "gbs")
srun ClonalFrameML ss-cp26.dnd wgs-cp26.fasta wgs_cp26 -kappa 4.00 -emsim 100
 
 
== Align two assemblies==
# With minimap2: https://github.com/lh3/minimap2
<code>minimap2 -ax asm5 ref.fa asm.fa > aln.sam      # assembly to assembly/ref alignment</code><br>
"For cross-species full-genome alignment, the scoring system needs to be tuned according to the sequence divergence."<br>
"-au options: asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5% sequence divergence"
# With MUMMER4 ([https://mummer4.github.io MUMMER4]): align two genomes, output delta file:
## nucmer -p A909-COH1 COH1.fa A909.fa
## dnadiff -p A909-COH1 -d A909-COH1.delta
## less A909-COH1.report, OR <code> grep AvgIdentity A909-COH1.report </code>
## output sam file
## nucmer --sam-long=COH1 B111.fa COH1.fa
## samtools view -b COH1.sam -T B111.fa > COH1.bam
## samtools sort COH1.bam -o COH1.sorted.bam
## samtools index COH1.sorted.bam
# Use GSAlign
<pre>
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda install gsalign
gsalign index ref.fas cov
gsalign -i cov -q test.fas
</pre>
# bcf annotate with gff file:
<code>bcftools csq -f ../ensembl-B31-downloads/lp54.fa -g ../ensembl-B31-downloads/lp54.gff3 -Ov -o lp54.anno.vcf lp54.vcf</code>
 
# variant call with bcftools: https://samtools.github.io/bcftools/howtos/variant-calling.html
# bcftools cheatsheet: https://gist.github.com/elowy01/93922762e131d7abd3c7e8e166a74a0b
 
=BERT classifier=
* Project start: Winter 2020
* Project team: Saadmanul Islam <Saadmanul.Islam91@myhunter.cuny.edu>
* Tutorial page: https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04
* Data set: bioluminescence
* Goal: cross validation
 
=Estimate LD50 using GLM=
<syntaxhighlight lang="r">
## curves
library(dplyr)
library(ggplot2)
library(growthcurver)
library(reshape2)
library(purrr)
setwd("/Users/desireepante/Desktop/Programming")
OD20<-read.csv("OD_0220.csv")
OD220<-OD20[-c(6,11:12,20,26:28, 32:33,45:47),]
od <- filter(OD220, IPTG == 0);
 
#N15
od15d <- mutate(od, od.norm = OD/max(OD)) # normalize OD within the IPTG == 0 subset
test.1<-glm(min ~ OD, data= od15d, family= "binomial")
ilink<-family(test.1)$linkinv
test.1.pd <- with(od15d,
                  data.frame(min = seq(min(min), max(min),
                                      length = 10)))
test.1.pd <- cbind(test.1.pd, predict(test.1, test.1.pd, type = "link", se.fit = TRUE)[1:2])
test.1.pd <- transform(test.1.pd, Fitted = ilink(fit), Upper = ilink(fit + ( se.fit)),
                      Lower = ilink(fit - ( se.fit)))
 
test.test<-ggplot(od15d, aes(x = min, y = od.norm)) +
  geom_ribbon(data = test.1.pd, aes(ymin = Lower, ymax = Upper, x = min),
              fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
  geom_line(data = test.1.pd, aes(y = Fitted, x = min)) +
  geom_point()
test.test
</syntaxhighlight>
=1k genome project/VCFTools=
<syntaxhighlight lang="bash">
# Extract APOL1 locus
vcftools --gzvcf ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --from-bp 36253071 --to-bp 36267530 --recode --stdout --recode-INFO-all --chr 22 | gzip -c > apol1.vcf.gz
 
# allele frequencies/counts
vcftools --vcf pvt1.recode.vcf --keep AFR.supergroup --counts --out afr
 
# pi
 
# Fst
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop AMR.supergroup --vcf pvt1.recode.vcf --out AFR-AMR  --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AFR-EAS  --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AFR-EUR  --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AFR-SAS  --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AMR-EAS  --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AMR-EUR  --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AMR-SAS  --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out EAS-EUR  --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EAS-SAS  --remove-indels
vcftools --weir-fst-pop EUR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EUR-SAS  --remove-indels
 
# concatenate vcf from different chromosomes
vcf-concat apol1-exon-6-snps.vcf tvp23-exon-1-snps.vcf hpr-exon-5-snps.vcf > 3-exons.vcf
 
# vcf to tabular format
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp
 
vcf-query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp
</syntaxhighlight>
 
=Ggplot2 tips=
* Data binning and nice histograms with density and boxplot:
[https://www.jdatalab.com/data_science_and_data_mining/2017/01/30/data-binning-plot.html R binning]
 
<syntaxhighlight lang="bash">
# numerical vector as factor so that it is sorted numerically (no need for as.character())
x.df$train.size <- as.factor(x.df$train.size)
 
# jitter by group with "position_jitterdodge()"
x.df %>%
  ggplot(aes(x=rate.cat, y=accuracy, color=group)) + 
  geom_boxplot(position=position_dodge(0.8)) +
  geom_jitter(position=position_jitterdodge(0.8)) +
  facet_wrap(~train.size, nrow = 1) + ylim(0,1) +
  geom_abline(intercept = 0.5, slope = 0, linetype="dashed") +
  theme(legend.position = "bottom")
 
# line plot with multiple factors with "interaction()"
rec.df %>% 
  ggplot(aes(x = tissue, y = rec.rate)) +
  geom_boxplot(outlier.shape = NA, aes(fill = tissue), alpha = 0.5) +
  geom_point(size = 3, alpha = 0.5, aes(color = symptom)) +
  geom_line(aes(group = interaction(id, timepoint)), linetype = 2) +
  theme_bw() +
  scale_fill_manual(values = c("lightgreen", "lightpink")) +
  ylab("Num Rec frags") +
  xlab("tissue") +
  scale_y_log10()
</syntaxhighlight>
 
=Capture results in R/GA example=
<syntaxhighlight lang="bash">
# in general, use lapply over indices to capture element names:
lapply(seq_along(x), function(i) { name <- names(x)[i] })
# save each result to a data.frame, then
# bind into a single data.frame using a dplyr function:
x.df <- bind_rows(x.rate)
 
library(stringdist)
library(ggplot2)
library(GA) # for ga()
 
out.fit <- lapply(seq(from = 5,to = 150, by = 5), function (num.epi) {
  num.alle <- 20; # num of antigens
  epi.fit <- function(x) { # fitness defined by the minimum pair diff
    x.list <- split(x, rep(1:num.alle, each=num.epi))
    #  sum(seq_distmatrix(x.list, method = "hamming"))/(num.alle * (num.alle-1) / 2)/num.epi # average pairwise diff
    min(seq_distmatrix(x.list, method = "hamming"))/num.epi # min diff
  }
  epi.ga <- ga(type = "binary", fitness = epi.fit, nBits = num.epi * num.alle)
  c(all = num.alle, epi = num.epi, min.pair.dist = epi.ga@fitnessValue)
})
 
epi.df <- t(as.data.frame(out.fit))
rownames(epi.df) <- 1:nrow(epi.df)
ggplot(as.data.frame(epi.df), aes(x=epi, y=min.pair.dist)) + geom_point() + geom_line()
</syntaxhighlight>
 
=NCI Cloud (Seven Bridges)=
* Documentation: http://www.cancergenomicscloud.org/controlled-access-data/
* [https://docs.cancergenomicscloud.org/blog/programmatically-access-tcga-data-using-the-seven-bridges-cancer-genomics-cloud API Documentation (to access meta-data)]
* Steps (June 20, 2018)
** Create project
** Data -> Data Overview -> Select cancer type -> Data Browser
** Copy files to project
** Go to project, "add filter", filter by "experimental strategy" (RNA-SEQ, miRNA-SEQ)
** To download files: select & Download link; run <code>aria2c -i download-links.txt</code>
** To download meta data: Download manifest
* Steps (better)
# Create project
# Search "Public Apps"
# Load data & Run
* Steps (a little hard to follow)
# Create project
# Data -> Data Overview -> Select cancer type -> Data Browser
# Click File to add property (e.g., "miRNA") -> "Copy Files to Project" -> choose project to add & add a tag to the files
# Go to Project -> Files -> filter files by tag name -> select -> Save
# Choose a public protocol/workflow (e.g., RNA-SEQ alignment)
# Add index and GTF files
# Go  to Project -> task -> run
 
=Reloop a circular plasmid from PacBio assembly=
# Input files: Z9-cp26.fa (containing overlapping ends); b31-cp26.fa (reference). A consolidated shell sketch follows this list.
# Find the overlapping ends by blasting the assembly against itself: blastn -task megablast -query Z9-cp26.fa -subject Z9-cp26.fa -evalue 1e-100 -outfmt 6
# The result says "1-7617" is the same as "26489-34118", so we use bioseq to remove the overlapping part: bioseq -s"1,26488" Z9-cp26.fa > Z9-cp26.fa2
# Run nucmer against b31-cp26 to identify the starting position: nucmer --coord ../b31-cp26.fa Z9-cp26.fa2
# The result says the first base of B31 corresponds to position 22171 of Z9-cp26, so we run bioseq to reloop: bioseq -R 22171 Z9-cp26.fa2 > Z9-cp26.fa3
# Run nucmer to confirm: nucmer --coord ../b31-cp26.fa Z9-cp26.fa3
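The same steps as a copy-and-paste shell sketch (file names and coordinates are the ones from the example above; yours will differ):
<syntaxhighlight lang="bash">
# find the overlapping ends by self-blast
blastn -task megablast -query Z9-cp26.fa -subject Z9-cp26.fa -evalue 1e-100 -outfmt 6
# trim the redundant end (keep bases 1-26488)
bioseq -s"1,26488" Z9-cp26.fa > Z9-cp26.fa2
# find the base corresponding to position 1 of the reference
nucmer --coord ../b31-cp26.fa Z9-cp26.fa2
# reloop so the plasmid starts at that position, then confirm
bioseq -R 22171 Z9-cp26.fa2 > Z9-cp26.fa3
nucmer --coord ../b31-cp26.fa Z9-cp26.fa3
</syntaxhighlight>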
 
=KEGG REST API=
<syntaxhighlight lang=python">
# Author: Edgaras Bezrucenkovas
# Date: Nov 12, 2017
#KEGG API
#Obtain module pathways and orthologs from KEGG pa14 database
 
#Import module to fetch the data from KEGG database
import urllib.request
import re            #Regular expressions
 
 
#Get PA14 modules
kegg_db = "http://rest.kegg.jp/link/module/pau"
pa14_mdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_mods = []
for line in pa14_mdata:
    line = line.split("\t")
    mod = line[1].strip("md:pau_")
    pa14_mods.append(mod)
 
#Get PA14 genes
kegg_db = "http://rest.kegg.jp/link/pau/module"
pa14_gdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_genes = []
for line in pa14_gdata:
    gdata = line.split("\t")
    gdata[0] = gdata[0].strip("md:pau_")
    gdata[1] = gdata[1].strip("pau:")
    pa14_genes.append(gdata)
   
#Open the file containing KEGG compounds ids
id_file = open("kegg-entries.txt")
kegg_ids = id_file.readlines()
compound_ids = []
for cid in kegg_ids:
    cid = cid.strip("\n")
    compound_ids.append(cid)
id_file.close()
 
#Get module pathway ids
for cid in compound_ids:
    kegg_db = "http://rest.kegg.jp/link/module/compound:"+cid
    module_data = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
    module_ids = []
    for line in module_data:
        if re.match("cpd", line):
            line = line.split("\t")
            m_id = line[1].strip("md:")
            module_ids.append(m_id)
        else:
            module_ids.append("NA")
           
    #Filter module pathway ids
    modules_final = []
    for m_id in module_ids:
        if m_id == "NA":
            modules_final.append("NA")
        elif m_id in pa14_mods:
            modules_final.append(m_id)
    if not modules_final:
        modules_final.append("NA")
 
    #Get genes
    genes = {}
    for m_id in modules_final:
        key = m_id
        genes.setdefault(key, [])
        if m_id == "NA":
            genes[key].append("NA")
        else:
            for i in pa14_genes:
                if m_id == i[0]:
                    genes[key].append(i[1])
                   
    #Get module names using module ids
    module_names = {}
    for m_id in modules_final:
        if m_id == "NA":
            m_name = {"NA":"NA"}
            module_names.update(m_name)
        else:
            kegg_db = "http://rest.kegg.jp/list/module:"+m_id
            module_data = urllib.request.urlopen(kegg_db).read().decode("utf-8").split("\t")
            module_name = module_data[1].strip("\n")
            m = {m_id:module_name}
            module_names.update(m)
 
    #Output
    for k, v in genes.items():
        for i in v:
            print(cid,k,i,module_names.get(k), sep = "\t")
</syntaxhighlight>
 
=D3js Tutorial=
# Install brackets editor from adobe: http://brackets.io/
# Enable web server & create a directory "public_html" in your home directory
# Create the following file structure: test/index.html; test/js/index.js; test/css/index.css (a shell sketch follows this list)
# Basic D3: following this link
# Metabolomics: following this link
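A shell sketch of steps 2-3, assuming the web server serves files from ~/public_html:
<syntaxhighlight lang="bash">
# create the directory layout listed above, with empty starter files
mkdir -p ~/public_html/test/js ~/public_html/test/css
touch ~/public_html/test/index.html \
      ~/public_html/test/js/index.js \
      ~/public_html/test/css/index.css
</syntaxhighlight>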
 
=Transform a wide data frame to a long one=
* A function in R
<syntaxhighlight lang="bash">
df2long = function(x, varname, cname, rname){
   num.col = dim(x)[2]
   num.col = dim(x)[2]
   x$tmp = rownames(x)
   x$tmp = rownames(x)
Line 8: Line 700:
   colnames(y)[1] = rname
   colnames(y)[1] = rname
   y
   y
} </code>
}  
</syntaxhighlight>


* A function in Perl
<syntaxhighlight lang="perl">
#!/usr/bin/env perl
# expect a data frame with row and col headings
use strict;
use warnings;
use Data::Dumper;
 
my $first = 0;
my @colnames;
my %data;
while(<>) {
    chomp;
    $first++;
    if ($first == 1) {
        @colnames = split;
        next;
    }
 
    my @data = split;
    my $rowname = shift @data;
    for (my $i=0; $i<=$#data; $i++) {
        $data{$rowname}->{$colnames[$i]} = $data[$i];
    }
}
 
#print Dumper(\%data);
 
foreach my $row (keys %data) {
    foreach my $col (@colnames) {
        print $row, "\t", $col, "\t", $data{$row}->{$col}, "\n";
    }
}
 
exit;
</syntaxhighlight>
 
=Variant call with samtools/bcftools & variant verification using IGV=
# mpileup: <code>samtools mpileup -uf ref.fa sorted1.bam sorted2.bam ... </code>
# [https://samtools.github.io/bcftools/howtos/variant-calling.html call variants]: <code>bcftools call -mv > var.raw.vcf </code> OR: <code>bcftools call -c -v --ploidy 1 gbs-50.mpileup > raw.vcf </code>
# extract only SNP sites: <code>vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all </code>
# filter by quality: <code>bcftools filter -i '%QUAL>200' gbs50-snps.recode.vcf > gbs50-snps2.vcf </code>
# check by TsTv: <code>bcftools stats gbs50-snps2.vcf | grep TSTV </code>
# Extract SNPs into Fasta:
## Get the sample list with bcftools: <code>bcftools query -l gbs50-snps2.vcf > samples</code>
## <code>cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' gbs50-snps2.vcf; echo; done > samples.fas </code>
## get ref seq: <code>echo ">ref" >> sample.fas; grep "^CP" gbs50-snps2.vcf | cut -f4 | paste -s -d '' >> sample.fas</code>
# Visualization with IGV
## Prepare & load reference genomes
## Load FASTA:  <code>bioseq -i'genbank' ref.gb > ref.fas</code>
## Prepare GFF3 file: <code>bp_genbank2gff3.pl --CDS --filter exon --filter gene /home/chongdi/michelle/pa_rp73.gb</code>
# Prepare BAM files (see below on how to index and align reads using bwa and samtools, in "ospC Amplicon")
# Index sorted bam files with samtools & load BAM files (multiple bam files okay)


=KRAKEN=
# assigns taxonomic labels to short DNA reads by examining the k-mers within a read and querying a database with those k-mers: <code>kraken -db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 --fastq-input 02015P1_S18_L001_R1_001.fastq.gz 02015P1_S18_L001_R2_001.fastq.gz --gzip-compressed --output outfile --paired </code>
# summarize taxonomy: <code> kraken-translate --db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 outfile > predict.file </code>
# count unique strains: <code>cut -f2 s75.predict | cut -f10 -d';' | grep "^[a-zA-Z]" | sort | uniq -c | sort -rn</code>
# Pick the strain with the highest counts as the reference genome

=PATRIC microbial genome annotation CLI=
* Install Debian distribution from github: https://github.com/PATRIC3/PATRIC-distribution/releases/tag/1.015
* Tutorial: http://tutorial.theseed.org/PATRIC-Tutorials/
* Fetch all genomes from a genus: <code>p3-all-genomes --in genome_name,Borrelia > Borrelia.genomes</code>; <code>p3-get-genome-data -i Borrelia.genomes -e genome_status,complete > Borrelia-complete.genomes</code>
* Retrieve features from coding sequences only (the input file must be a list of genome ids with a header): <code>p3-get-genome-features -i $file -e feature.feature_type,CDS -a pgfam_id -a patric_id -a plfam_id -a start -a end -a strand -a genome_name -a product -a accession > ${name}-all-features.txt</code>
* Retrieve feature sequence: <code>p3-echo -t feature.accession "fig|100177.4.peg.1" | p3-get-feature-sequence --dna</code>
* Submit genomes for annotation
# Log in: p3-login user name/password
# Submit a genome for annotation:
## <code>submit-patric-annotation --scientific-name "Pseudomonas aeruginosa" --taxonomy-id 287 --genetic-code 11 --domain Bacteria --log pa.log /weigang@patricbrc.org/home/wsdir gid_5.fas</code>
## <code>submit-patric-annotation --scientific-name "Streptococcus agalactiae" --taxonomy-id 1311 --genetic-code 11 --domain Bacteria --log sa-1.log /weigang@patricbrc.org/home/wsdir sa-1.fas</code>
## Message: "Uploading gid_5.fas to workspace at /weigang@patricbrc.org/home/wsdir/gid_5/gid_5.fas”
# Create genome groups
## login
## <code>cat trep.complete.ids | p3-put-genome-group "Treponema complete"</code>
## <code>p3-get-genome-group "Treponema complete"</code>


=Parsimony reconstruction using MPR=
=ospC amplicon identification=
<syntaxhighlight lang="bash">
# de novo amplicon assembly with cortex_con (quality threshold 10, minimum coverage 100)
~/cortex_con/bin/cortex_con_31 --input cortex-input-file-list --input_format fastq --kmer_size 31 --mem_height 17 --mem_width 100 -d cortex-out-2 -q 10 -s 100
# Then run mapsembler to extend on both ends (-t2: contig, -p: prefix, -c: minimum coverage)
~/mapsembler2_pipeline/run_mapsembler2_pipeline.sh -s seq-24r.nuc -r "4_S4_L001_R1_001.fastq.gz 4_S4_L001_R2_001.fastq.gz" -t2 -p map-4 -c 200
#Index nucleotide file:
bwa index ref.fa

# may also be necessary to map reads separately, one reference per allele:
bioseq --break ref.fa
for f in *.fas; do bwa index $f; done

#align (paired-end reads against the indexed reference):
bwa mem ref.fa sample_R1.fastq.gz sample_R2.fastq.gz > sample.sam

#BAM output:
samtools view -b sample.sam > sample.bam

#Sort & index BAM file:
samtools sort -o sample.sorted sample.bam
samtools index sample.sorted
 
# Calculate coverage by bedtools (better, raw read coverages)
# based on: https://bioinformatics.stackexchange.com/questions/2318/how-to-count-reads-in-bam-per-bed-interval-with-bedtools
bedtools bamtobed -i sample.sorted > sample.bed # bam to bed
bedtools coverage -b sample.bed -a ospC.bed  > sample.coverage

# Alternatives (not good):
bedtools coverage -b sample.sorted.bam -a refs.bed -d > sample.bedtools.cov # per-site
bedtools coverage -b sample.sorted.bam -a refs.bed -counts > sample.bedtools.cov # average
 
# Generate VCF file (not needed for coverage)
samtools mpileup -uf ref.fa s18.sorted.bam s56.sorted.bam > test.mpileup
bcftools call -c -v test.mpileup > test.raw.vcf
 
#Extract coverage by samtools (not good: capped artificially at depth=8k)
samtools depth sample.sorted.bam > sample.depth
</syntaxhighlight>

#R plot:
<syntaxhighlight lang="r">
library(lattice)
x <- read.table("sample.depth", sep="\t", header=F)
colnames(x) <- c("ref", "pos", "cov")
xyplot(log10(cov) ~ pos|ref, data= x, type = "l", main= "sample")
</syntaxhighlight>

BASH script from Tony Fung & Jamie Cabigas (Spring 2019):
<syntaxhighlight lang="bash">
#!/usr/bin/env bash

# ----------------------------------------
# File            : fas-to-vcf.sh
# Author          : Tony Fung & Jamie Cabigas
# Date            : March 21, 2019
# Description     : Batch convert FAS files to VCF files
# Requirements    : samtools 1.9, bcftools 1.9, bwa 0.7.12
# Input           : reference file, format of sequence files, up to 7 sequence id's
# Sample Command  : bash fas-to-vcf.sh ref.fasta fastq ERR221622 ERR221661 ERR221662 ERR221663
# Output          : up to 7 VCF files
# ----------------------------------------

if [ $# -lt 3 ]
then
  echo "Usage: bash $0 <ref_file> <format_of_sequence_files> <sequence_id> [<sequence_id_2> <sequence_id_3> etc.]";
  exit;
fi

for arg
do
  if [ $arg == $1 ]
  then
    ref_file="../$1";
  elif [ $arg == $2 ]
  then
    file_format=$2;
  else
    seq_id=$arg;
    fas_file_1="${seq_id}_1.$file_format";
    fas_file_2="${seq_id}_2.$file_format";

    pwd;
    cd $seq_id;
    pwd;

    # Index nucleotide file:
    bwa index $ref_file;

    # Align:
    bwa mem $ref_file $fas_file_1 $fas_file_2 > $seq_id.sam

    # BAM output:
    samtools view -b $seq_id.sam > $seq_id.bam

    # Sort & index BAM file:
    samtools sort $seq_id.bam -o $seq_id.sorted.bam
    samtools index $seq_id.sorted.bam

    # Generate VCF file:
    # -u is deprecated, use bcftools mpileup instead
    # samtools mpileup -uf $ref_file $seq_id.sorted.bam > $seq_id.mpileup
    bcftools mpileup -f $ref_file $seq_id.sorted.bam > $seq_id.mpileup
    bcftools call -c -v $seq_id.mpileup > $seq_id.raw.vcf

    cd ..;
    pwd;
  fi
done
</syntaxhighlight>


==BLAST+: search ("google") for homologs/pairwise alignment==
==hmmer==
* link: http://hmmer.org/
* Installation: <code>conda install -c biocore hmmer</code> # Anaconda
* Build profile: <code>hmmbuild globins4.hmm globins4.sto</code>
* Search: <code>hmmsearch globins4.hmm uniprot_sprot.fasta > globins4.out</code>
==cdhit==
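A minimal clustering sketch (file names are placeholders; -n is the word size matched to the identity cutoff -c):
<syntaxhighlight lang="bash">
# cluster DNA sequences at 95% identity
cd-hit-est -i seqs.fas -o seqs.nr95 -c 0.95 -n 10 -d 0
# cluster protein sequences at 90% identity
cd-hit -i prot.fas -o prot.nr90 -c 0.9 -n 5 -d 0
</syntaxhighlight>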


=Programs for producing phylogeny & phylogenetic analysis=
==IQ-Tree==
* [http://www.iqtree.org/doc/Tutorial Beginner's tutorial]
* [http://www.iqtree.org/doc/Advanced-Tutorial Advanced tutorial]
* Model search (slow!! better done with byobu)
** <code>iqtree -s example.phy -m MFP</code>
* protein tree with fast bootstrap:
** <code>iqtree -s example.phy -m WAG+I+G -bb 1000</code> (version 1.xx; -B for latest version)
* site-specific rates:
** <code>iqtree -s example.phy -rate</code>
** <code>iqtree -s example.phy -wsr</code> (version 1.xx; -t and -m from above to save time)
* ancestral state reconstruction:
** <code>iqtree -s example.phy -asr</code> (-t and -m from above to save time)
==FastTree==
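A minimal sketch (alignment file names are placeholders):
<syntaxhighlight lang="bash">
# approximate-ML tree from a nucleotide alignment with the GTR model
FastTree -nt -gtr aln.fas > tree.nwk
# protein alignment (JTT model by default)
FastTree prot-aln.fas > prot-tree.nwk
</syntaxhighlight>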
==PHYLIP==
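PHYLIP programs are menu-driven: each reads an alignment named "infile" in the current directory and writes "outfile"/"outtree". A sketch with dnapars (parsimony), answering "Y" to accept the default menu settings:
<syntaxhighlight lang="bash">
cp aln.phy infile          # PHYLIP-format alignment
printf "Y\n" | dnapars     # accept defaults; writes outfile and outtree
mv outtree dnapars.nwk
</syntaxhighlight>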
==RAxML==
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG</code> # protein, gamma, Whelan & Goldman (2001) WAG model
** <code>raxmlHPC-SSE3 -s protein.phy -n A2 -m PROTGAMMAGTR</code> # protein, gamma, user model
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 </code> # protein, gamma, WAG model, bootstrap
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -o Carp,Loach</code> # protein, gamma, WAG model, root on a (monophyletic) group
** <code>raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -# autoFC -p 0123 </code> # protein, gamma, WAG model, bootstrap (at least 99 splits, auto-stopping)
** <code>raxmlHPC-SSE3 -f a -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 -x 0123 </code> # Rapid bootstrap with consensus
*** output 1, RAXML_bestTree.A1 (ml tree)
=R packages for phylogenetics=
==APE==
===Convert a tree to ultrametric using time estimates===
<syntaxhighlight lang="r">
t <- read.tree("core-tree-30otus.dnd")
t.ultra <- chronopl(t, 0)
t.hclust <- as.hclust(t.ultra)
t.dendro <- as.dendrogram(t.hclust)
heatmap(ge.var, Rowv=t.dendro) # order the rows with the customized tree
</syntaxhighlight>
==phangorn==
==phytools==


=Population genetics=
==LDHat: test of recombination based on 4-gametes==
* Filter sites: <code>convert -seq snps.sites -loc snps.locs -freqcut 0.08</code>
* Pairwise LD tests: First generate "lookup" table: <code>lkgen -lk /usr/local/LDhat/lk_files/lk_n50_t0.001 -nseq 48</code>; then calculate pairwise LD statistics: <code>pairwise -seq sites.txt -loc locs.txt -lk new_lk.txt</code>
* Run interval for recombination hot spots: <code>interval -seq sites.txt -loc locs.txt -lk new_lk.txt -its 1000000 -bgen 5 -exact</code>
* Get stats: <code>stat -input rates.txt -loc locs.txt -burn 50</code> (burn-in = 100K)
* Plot in R:
<syntaxhighlight lang = "bash">
x = read.table("rates.txt", skip=1, fill=T);
x = as.matrix(x);
burn.in = 50;
low = as.integer(nrow(x)*burn.in/100);
means<-apply(x[low:nrow(x),], 2, mean, na.rm=T);
q.95<-apply(x[low:nrow(x),], 2, quantile, probs=c(0.025, 0.5, 0.975), na.rm=T);
pos<-as.vector(as.matrix(read.table("locs.txt", as.is=T, skip=1)));
plot(pos[1:(length(pos)-1)], y=means[2:length(means)], type="s", col=rgb(0,0,0.5),
    xlab="SNP position (B111)", ylab="Posterior mean recombination rate", main="GBS Strep: recombination by LDhat");
lines(pos[1:(length(pos)-1)], y=q.95[1,2:length(means)], type="s", col=grey(0.75), lty="dotted");
lines(pos[1:(length(pos)-1)], y=q.95[3,2:length(means)], type="s", col=grey(0.75), lty="dotted");
</syntaxhighlight>
==ms: coalescence simulation==
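A minimal sketch of Hudson's ms (sample size, replicate count, and parameter values are illustrative):
<syntaxhighlight lang="bash">
# 10 chromosomes, 1000 replicates, theta = 5
ms 10 1000 -t 5.0 > ms.out
# add recombination: rho = 8 across a 2,501-bp locus
ms 10 1000 -t 5.0 -r 8.0 2501 > ms-rec.out
</syntaxhighlight>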
==SFS: forward simulation==
* Turn Into VCF with reference
Make a sample list file (from bubble or multicolor log file):
<code>cat e1-bubbles-in-sample.log | grep CLEANED | cut -f2 > e1.sample.list</code>

Customize the following command based on your output files, num of colors, index of ref colors, etc.:
<code>perl /home/weigang/CORTEX_release_v1.0.5.21/scripts/analyse_variants/process_calls-wq.pl --callfile e1-bubbles-in-sample.out --callfile_log e1-bubbles-in-sample.log -outvcf e1-bubbles-in-sample --outdir e1-vcfout --samplename_list e1.sample.list --num_cols 7 --stampy_bin /home/weigang/stampy-1.0.28/stampy.py --stampy_hash ref --refcol 6 --vcftools_dir /usr/local/bin --caller BC --kmer 31 --ploidy 1</code>
Latest revision as of 05:37, 2 November 2024

  • install weblogo from github
  • Or install with conda: conda install -c conda-forge weblogo
  • Or install with pip: pip install weblogo
  • run the following command
weblogo -f IR4-pep.fas -o ir4-logo.pdf -D fasta -F pdf -A protein -s large -t "IR4" -c chemistry

Tree visual with ggtree

setwd("C:/Users/Weigang/Dropbox/Natasha-files/")
library(ggtree)
library(treeio) 
library(tidyverse)

# read tree
tree <- read.tree("asian_clade3_v2.nwk")

# create a tibble of OTU groups
tips <- tibble(id = tree$tip.label, 
               origin = c(rep("Eurasia",4), "US", "Eurasia", 
                         "US", rep("Eurasia",25))) 

# plot tree
p <- ggtree(tree) + 
  xlim(0,0.02) + # to avoid overflow of labels
  geom_treescale(x=0, y=30)   
 
# join tree and add group color 
p %<+% tips + 
  geom_tiplab(aes(color = origin)) + 
  scale_color_manual(values = c("Eurasia" = 1,  "US" = 2)) + 
  theme(legend.position = "none")

Github for sharing data and source code

  • SSH setup (github no longer allows password-based push to remote repository)
  1. Create a new SSH key on your computer
  2. Add the key to your Github account
  3. Commit with SSH
    1. Authentication/test: ssh -T git@github.com
    2. Add repository for commit: git remote set-url origin git@github.com:bioperl/p5-bpwrapper.git
  • Developer workflow
    • Clone remote repository: git clone <rep-name.git>
    • Sync local to remote repository: git pull
    • Check local repository status: git status
    • Show the latest commit: git log
    • Add new file: git add <filename>
    • Commit changes to local repository: git commit -a -m "message"
    • Push to remote repository: git push

bcftools pipeline

bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf vcftools --vcf snps4.vcf --maf 0.01 --recode --recode-INFO-all --out maf (n=33 SNPs) bcftools query -l maf.recode.vcf > samples (n=17775 samples) cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' maf.recode.vcf; echo; done > samples-maf.fas

R tips

Add expected normal curvce to histogram

plot.trait <- function(qt.out) {
  phe <- qt.out[[4]]$phe
  exp.mean <- qt.out[['trait.exp']][1]
  exp.sd <- sqrt(qt.out[['trait.exp']][2])
  x <- seq(min(phe), max(phe), length = 100)
  fun <- dnorm(x, mean = exp.mean, sd = exp.sd)
  hist(phe, probability = T, ylim = c(0, max(fun)), br = 50, las = 1)
  lines(x, fun, col = 2, lwd = 2)
}

Run batch contingency table tests

  1. Numbered list item
snp_ct <- snp_long %>%   group_by(POS, geno, virulence) %>%   count()

# get pos with < 4 entries:
bad_pos <- snp_ct %>% group_by(POS) %>% count() %>% filter(n < 4)

library(broom)

# batch fisher's exact test
snp_fisher <- snp_long %>% 
     filter(!POS %in% bad_pos$POS) %>%   
     group_by(POS) %>%   
     do(tidy(xtabs(~geno + virulence, data = .) %>% 
        fisher.test(simulate.p.value = T)))

snp_fisher <- snp_fisher %>% mutate(y = -log10(p.value))

# plot vocano
snp_fisher %>% 
    ggplot(aes(x = estimate, y = y)) +   
    geom_point(shape = 1, alpha = 0.5) +   
    scale_x_log10() + 
    geom_vline(xintercept = 1, linetype = 2, color = "red") +  
    theme_bw() +  
    xlab("odds ratio (log10)") +  
    ylab("signficance (-log10[p])")


multiple regression models (with broom::tidy)

library(broom)
compensation.models <- compensation %>% group_by(Grazing) %>% do(tidy(lm(Fruit ~ Root, data = .))) %>% filter(term != '(Intercept)')

multiple regression models (with nested tibble)

# build multiple models, one for each serum:
by_serum <- two_od %>% group_by(Serum) %>% nest() # nested data frame, one row per serum
x_model <- function(df) { # model function (similar to lapply)
  x <- df %>% filter(OspC!='A02' & OspC!='A04')
#  x <- df %>% filter(OspC!='A02' & OspC!='A04' & OspC!='N14')
  lm(OD1 ~ OD2, data = x)
}

by_serum <- by_serum %>% mutate(model = map(data, x_model)) # add model to each serum

output <- vector("list")
for(i in 1:length(by_serum$Serum)) {
  df <- by_serum$data[[i]]
  model <- by_serum$model[[i]]
  pd <- predict.lm(model, newdata = data.frame(OD2 = df$OD2))
  output[[i]] <- data.frame(OspC = df$OspC,
                            OD2 = df$OD2,
                            OD1 = df$OD1,
                            OD1_pred = pd,
                            Serum = rep(by_serum$Serum[i], nrow(df))
                                        )
}
df.out <- bind_rows(output)

multiple regression models (with custum functions)

# build model
models <- xy %>% split(.$Serum) %>% map(~lm( aff_PL ~ aff_C3H, data= .))

# get r-squared:
models %>% map(summary) %>% map_dbl("adj.r.squared")

# get p values (of slope, using a custom function):
get.pvalue <- function(x){
  s <- summary(x); 
  s$coefficients[2, 4]
}
models %>% map(get.pvalue) %>% unlist()

# get slope:
get.slope <- function(x){
  s <- summary(x); 
  s$coefficients[2, 1]
}
models %>% map(get.slope) %>% unlist()

# get intercept:
get.intercept <- function(x){
  s <- summary(x); 
  s$coefficients[1, 1]
}
models %>% map(get.intercept) %>% unlist()

Microbial genome alignment with MUMMER4 and other tools

SARS-CoV-2 GISAID pipleline

#######################################
# Parse GISAID sequences fasta
######################################
1. bioseq -B cov.fas # burst into individual files
1a. move the un-bursted file out the "human-files" directory!!!
mv cov-human.fas ../

2. sam align (weigang@wallace:~/cov-03-09-2030/host-human$ for f in *.fas; do ../sam-align.bash $f; done )
    nucmer --sam-long=COH1 B111.fa COH1.fa
    samtools view -b COH1.sam -T B111.fa > COH1.bam
    samtools sort COH1.bam -o COH1.sorted.bam
    samtools index COH1.sorted.bam

    with script: for f in *.fas; do ../sam-align.bash $f; done

3c. Less strict call: bcftools mpileup -Ou -f ../ref.fas   *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt  -Ob -o calls.bcf -P 0.05 (or -P 0.1; large P value for less strict call, default 1.1e-3)
# 3b. bcftools mpileup -Ou -f ../ref.fas  *.sorted.bam | bcftools call -mv --ploidy-file ploidy.txt  -Ob -o calls.bcf, with default ploidy as 1:
weigang@wallace:~/cov-03-09-2020/cov57$ cat ploidy.txt
*       *       *       M       1

6a. bcftools view -m2 -M2 --types snps calls.bcf > snps.bcf ( get only biallelic SNPs)
6b. vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all --out cov.vcf # filter SNPs only

7. filter sites by allele counts: only informative sites
bcftools view snps.bcf > snps.vcf
vcftools --vcf snps.vcf --mac 2 --recode --recode-INFO-all --out snps2.vcf

Pipeline

######################
Host & Enviroment: wqiu@bioit.hunter.cuny.edu
vcftools & clonalframe "source activate gbs"
bcftools: "source activate ngs"

1. align fastq to ref
1) index reference genome

bwa index B111.fa
=> 5 files: amb, ann, bwt, pac, sa

2) align reads to ref
ls -1 *.gz | cut -d'.' -f1 | sort -u > sample.list

byobu

cat sample.list | while read line; do bwa mem B111.fa ${line}.R1.fastq.gz ${line}.R2.fastq.gz > $line.sam; done
=> sam file


2. convert to bam, sort by ref coordinates

cat sample.list | while read line; do samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam; done
	=> acc.bam


1+2: wrapped to save space
cat sample.list | while read line; do 
	begin=$(date +"%D:%H:%M")
	echo -ne "$line ... started $begin ..."; 
	bwa mem B111.fa ${line}.clean.1.fastq.gz ${line}.clean.2.fastq.gz > $line.sam 2> /dev/null; 
	bwa_time=$(date +"%D:%H:%M")
	echo -ne "sam file generated: $line.sam on $bwa_time ...";
	samtools view -F 4 -Sbh ${line}.sam | samtools sort -o ${line}.bam 2> /dev/null;
	bam_time=$(date +"%D:%H:%M")
	echo -ne "bam file generated: $line.bam on $bam_time ...";
	rm $line.sam;
	echo "sam file deleted. Done";
	done



3. index
cat sample.list | while read line; do samtools index ${line}.bam; done
	=> acc.bam.bai



4. Variant Call

1) make ploidy.txt file
cat > ploidy.txt
*       *       *       M       1

2) pileup and make bcf file
bcftools mpileup -Ou -f B111.fa *.bam | bcftools call -mv --ploidy-file ploidy.txt -Ob -o calls.bcf -P 0.05
((or -P 0.1; large P value for less strict call, default 1.1e-3))
=> calls.bcf



5. analyse calls.bcf; remove outlier SNPs and samples
1) check stats
bcftools stats calls.bcf > call.stats
ts/tv (transition/transversion) better greater than 3
check AF freq, remove the first class if necessary (e.g., vcf --maf 0.008)

check counts:
vcftools --vcf snps.vcf --counts --out snps

Filter by maf counts (at least 2):
vcftools --vcf snps-255.recode.vcf --recode --recode-INFO-all --out snps2 --mac 2


2) get only biallelic SNPs
bcftools view -m2 -M2 --types snps calls-87.bcf > snps-87.vcf
bcftools stats snps.vcf | ll

bcftools view -m2 -M2 --types snps call-49.vcf > snps-49.vcf

3) merge samples
bgzip snps-87.vcf
bgzip snps-49.vcf
=> vcf.gz file

bcftools index snps-87.vcf.gz
bcftools index snps-49.vcf.gz
=> vcf.gz.csi file

bcftools merge -Ob snps-87.vcf.gz snps-49.vcf.gz > snps-136.bcf

get only biallelic SNPs
bcftools view -m2 -M2 --types snps snps-136.bcf > snps-136.vcf
ln -s snps-136.vcf snps.vcf

# use vcftools to remove invariant sites, in case samples are dropped:
vcftools --gzvcf snps2.vcf.gz --non-ref-ac-any 1 --recode --recode-INFO-all --out tmp --remove-indv IND

6. vcf to fasta
1) modify sample name (should use "bcftools reheader -s change-id.txt -o tmp.vcf snps.vcf")
cat snps.vcf | sed 's/.bam//g; s/.sorted//g; s/batch-..//g' > tmp.vcf
mv tmp.vcf snps-136.vcf

2) sample list 
bcftools query -l snps.vcf > sample.txt

3) make fasta: could be gzipped
cat sample.txt | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' snps.vcf; echo; done > sample.fas
Note: . in fas file means missing nt, handled correctly iqtree

4) get ref seq
echo ">B31" > ref.fas
ll snps.vcf => get ref id
grep "^Chromosome" snps.vcf | cut -f4 | paste -s -d '' - >> ref.fas
#grep "^CP021772" snps.vcf | cut -f4 | paste -s -d '\0' - >> ref.fas (#if on apple system)

cat ref.fas >> sample.fas
=> 137 samples, 46783 snps

go to genometracker:
scp sample.fas snps-136.vcf weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/.

on genometracker:
ln -s snps-136.vcf snps.vcf

# subset samples (e.g., bbss only):
bcftools view -Ov -S samples-bbss.txt --force-samples snp5-lp17.vcf > snp5-lp17-bbss.vcf


7. make tree (with iqtree, for SNP-only alignment, with bootstrap; can't have constant sites)
iqtree2 -s sample.fas -m GTR+ASC -B 1000 -nt AUTO (-n AUTO to use all CPUs on the cluster)

# if containing full seq alignment with constant sites, e.g., HIV env gene alignment
# iqtree -s $work_dir/test.fas -m GTR+F+G -B 1000 --redo -nt AUTO
=> sample.fas.contree


8. consequence call
1) download ref in genbank format
NCBI nucleotide database: search CP021772
	=> B111.gb

2) convert genbank to gff3 format
(1) py file: modified from https://dmnfarrell.github.io/bioinformatics/bcftools-csq-gff-format
	=> gb2gff3.py
(2) ./gb2gff3.py B111.gb
	=> B111.gff3
(3) change acc to match ref name in snps.vcf
cat B111.gff3 | sed 's/^CP021772.1/CP021772/' > tmp.gff3
mv tmp.gff3 B111.gff3

3) call consequence (to tsv or bcf files)
bcftools csq -f B111.fa -g B111.gff3 snps.vcf -Ot -o snps-csq.tsv

cut -f2,5,6 snps-csq.tsv | tr '|' '\t' | cut -f1-5,7- > snps-csq2.tsv

9. web development
1) orf info from genbank
perl get-orf-by-acc.pl CP021772 > orf.csv

2)
scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/sample.fas.contree .
	#biotree -m sample.fas.contree | biotree --ref 'B111' > sample.dnd
	#biotree -r 'GBS02' sample.fas.contree | biotree --ref 'B111' > sample.dnd
	#biotree -E 0.005 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > sample.dnd
biotree -D 90 sample.fas.contree | biotree -r 'GBS02' | biotree --ref 'B111' > tree.dnd

tree visualization by R or PhyloView or R/ggtree

3)
scp weigang@genometracker.org:/mnt/bac_genome/GBS-Wu-Oct-2022/snps-csq2.tsv .

4) pheno.csv

clonal frame pipeline (WGS)

  1. Reconstitute whole-genome alignment from vcf; following this post: https://gatk.broadinstitute.org/hc/en-us/community/posts/360070002671-converting-multisample-vcf-to-fasta. Algorithm: split vcf into single-sample vcf's; use "bcftools consensus" to obtain alternate ref seqs. Environment: conda activate ngs (on bioit)
    1. create one-sample vcf's

cat ../samples-bbss.txt | while read line; do bcftools view -Ov -s $line ../bbss-cp26.vcf.gz > snp5-cp26-$line.vcf; done

    1. bgzip & tabit

for f in *.vcf; do bgzip $f; tabix $f.gz; done (or bcftools index) for f in snp5-lp54-*.vcf; do bgzip $f; bcftools index $f.gz; done

    1. Get consensus

From doc: "Create consensus sequence by applying VCF variants to a reference fasta file. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence. Using the --sample (and, optionally, --haplotype) option will apply genotype (haplotype) calls from FORMAT/GT." tail -27 ../samples-bbss.txt | while read line; do cat ../../B31-files/cp26_plasmid.fasta | bcftools consensus --sample $line snp5-cp26-$line.vcf.gz | sed "s/>cp26/>$line/" > bbss-cp26-$line.fa; done

    1. Add reference:

cat ../../B31-files/cp26_plasmid.fasta | sed "s/>cp26/>B31/" > bbss-cp26-B31.fa

concatenate & check length: cat *.fa > wgs-cp26.fasta bioseq -l wgs-cp26.fasta

check with bioaln for variability conda activate gbs bioaln -i 'fasta' -m wgs-cp26.fasta | less

  1. run ClonalFrame (conda env "gbs")

srun ClonalFrameML ss-cp26.dnd wgs-cp26.fasta wgs_cp26 -kappa 4.00 -emsim 100


Align two assemblies

  1. With minimap2: https://github.com/lh3/minimap2

minimap2 -ax asm5 ref.fa asm.fa > aln.sam # assembly to assembly/ref alignment
"For cross-species full-genome alignment, the scoring system needs to be tuned according to the sequence divergence."
"-au options: asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5% sequence divergence"

  2. With MUMMER4: align two genomes, output a delta file:
    1. nucmer -p A909-COH1 COH1.fa A909.fa
    2. dnadiff -p A909-COH1 -d A909-COH1.delta
    3. less A909-COH1.report, OR grep AvgIdentity A909-COH1.report
    4. to output a SAM file:
    5. nucmer --sam-long=COH1 B111.fa COH1.fa
    6. samtools view -b COH1.sam -T B111.fa > COH1.bam
    7. samtools sort COH1.bam -o COH1.sorted.bam
    8. samtools index COH1.sorted.bam
  3. Use GSAlign
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda install gsalign
gsalign index ref.fas cov
gsalign -i cov -q test.fas
  4. Annotate a BCF/VCF with a GFF file:

bcftools csq -f ../ensembl-B31-downloads/lp54.fa -g ../ensembl-B31-downloads/lp54.gff3 -Ov -o lp54.anno.vcf lp54.vcf

  5. variant call with bcftools: https://samtools.github.io/bcftools/howtos/variant-calling.html
  6. bcftools cheatsheet: https://gist.github.com/elowy01/93922762e131d7abd3c7e8e166a74a0b

BERT classifier

Estimate LD50 using GLM

## curves
library(dplyr)
library(ggplot2)
library(growthcurver)
library(reshape2)
library(purrr)
setwd("/Users/desireepante/Desktop/Programming")
OD20<-read.csv("OD_0220.csv")
OD220<-OD20[-c(6,11:12,20,26:28, 32:33,45:47),]
od <- filter(OD220, IPTG == 0);

# N15 (the N15 subset step is omitted here; "od" from above is used as a stand-in)
od15d <- mutate(od, od.norm = OD/max(OD)) # normalize OD to a 0-1 proportion
test.1 <- glm(od.norm ~ min, data = od15d, family = "binomial") # proportion response vs. time
ilink<-family(test.1)$linkinv
test.1.pd <- with(od15d,
                  data.frame(min = seq(min(min), max(min),
                                       length = 10)))
test.1.pd <- cbind(test.1.pd, predict(test.1, test.1.pd, type = "link", se.fit = TRUE)[1:2])
test.1.pd <- transform(test.1.pd, Fitted = ilink(fit), Upper = ilink(fit + ( se.fit)),
                       Lower = ilink(fit - ( se.fit)))

test.test<-ggplot(od15d, aes(x = min, y = od.norm)) +
  geom_ribbon(data = test.1.pd, aes(ymin = Lower, ymax = Upper, x = min),
              fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
  geom_line(data = test.1.pd, aes(y = Fitted, x = min)) +
  geom_point()
test.test
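The LD50 itself can be read off the fitted coefficients: on the logit scale the fitted probability equals 0.5 where intercept + slope × min = 0, so LD50 = -intercept/slope (with min as the dose variable here).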

1k genome project/VCFTools

# Extract APOL1 locus
vcftools --gzvcf ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --from-bp 36253071 --to-bp 36267530 --recode --stdout --recode-INFO-all --chr 22 | gzip -c > apol1.vcf.gz

# allele frequencies/counts
vcftools --vcf pvt1.recode.vcf --keep AFR.supergroup --counts --out afr

# pi
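# e.g., windowed nucleotide diversity (a sketch; the 1-kb window is an arbitrary choice)
vcftools --vcf pvt1.recode.vcf --keep AFR.supergroup --window-pi 1000 --out afr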

# Fst
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop AMR.supergroup --vcf pvt1.recode.vcf --out AFR-AMR  --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AFR-EAS  --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AFR-EUR  --remove-indels
vcftools --weir-fst-pop AFR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AFR-SAS  --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EAS.supergroup --vcf pvt1.recode.vcf --out AMR-EAS  --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out AMR-EUR  --remove-indels
vcftools --weir-fst-pop AMR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out AMR-SAS  --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop EUR.supergroup --vcf pvt1.recode.vcf --out EAS-EUR  --remove-indels
vcftools --weir-fst-pop EAS.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EAS-SAS  --remove-indels
vcftools --weir-fst-pop EUR.supergroup --weir-fst-pop SAS.supergroup --vcf pvt1.recode.vcf --out EUR-SAS  --remove-indels

# concatenate vcf from different chromosomes
vcf-concat apol1-exon-6-snps.vcf tvp23-exon-1-snps.vcf hpr-exon-5-snps.vcf > 3-exons.vcf

# vcf to tabular format
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp

vcf-query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%SAMPLE=%GT]\n' 3-exons.vcf > tmp

Ggplot2 tips

  • Data binning and nice histograms with density and boxplot:

R binning

# numerical vector as factor so that it is sorted numerically (no need for as.character())
x.df$train.size <- as.factor(x.df$train.size)

# jitter by group with "position_jitterdodge()"
x.df %>% 
  ggplot(aes(x=rate.cat, y=accuracy, color=group)) +  
  geom_boxplot(position=position_dodge(0.8)) + 
  geom_jitter(position=position_jitterdodge(0.8)) + 
  facet_wrap(~train.size, nrow = 1) + ylim(0,1) + 
  geom_abline(intercept = 0.5, slope = 0, linetype="dashed") + 
  theme(legend.position = "bottom")

# line plot with multiple factors with "interaction()"
rec.df %>%  
  ggplot(aes(x = tissue, y = rec.rate)) +
  geom_boxplot(outlier.shape = NA, aes(fill = tissue), alpha = 0.5) + 
  geom_point(size = 3, alpha = 0.5, aes(color = symptom)) +
  geom_line(aes(group = interaction(id, timepoint)), linetype = 2) +
  theme_bw() +
  scale_fill_manual(values = c("lightgreen", "lightpink")) +
  ylab("Num Rec frags") +
  xlab("tissue") + 
  scale_y_log10()

Capture results in R/GA example

# general pattern: iterate over indices to capture element names;
# build a data.frame per element, then bind them with a dplyr function:
x.rate <- lapply(seq_along(x), function(i) {
  name <- names(x)[i] # capture the name of element i
  data.frame(name = name, value = x[[i]]) # one data.frame per element (columns are illustrative)
})
x.df <- bind_rows(x.rate)

library(stringdist)
library(ggplot2)
library(GA) # provides ga(), used below

out.fit <- lapply(seq(from = 5,to = 150, by = 5), function (num.epi) {
  num.alle <- 20; # num of antigens
  epi.fit <- function(x) { # fitness defined by the minimum pair diff
    x.list <- split(x, rep(1:num.alle, each=num.epi))
    #  sum(seq_distmatrix(x.list, method = "hamming"))/(num.alle * (num.alle-1) / 2)/num.epi # average pairwise diff
    min(seq_distmatrix(x.list, method = "hamming"))/num.epi # min diff
  } 
  epi.ga <- ga(type = "binary", fitness = epi.fit, nBits = num.epi * num.alle)
  c(all = num.alle, epi = num.epi, min.pair.dist = epi.ga@fitnessValue)
})

epi.df <- t(as.data.frame(out.fit))
rownames(epi.df) <- 1:nrow(epi.df)
ggplot(as.data.frame(epi.df), aes(x=epi, y=min.pair.dist)) + geom_point() + geom_line()

NCI Cloud (Seven Bridges)

  1. Create project
  2. Search "Public Apps"
  3. Load data & Run
  • Steps (a little hard to follow)
  1. Create project
  2. Data -> Data Overview -> Select cancer type -> Data Browser
  3. Click File to add property (e.g., "miRNA") -> "Copy Files to Project" -> choose project to add & add a tag to the files
  4. Go to Project -> Files -> filter files by tag name -> select -> Save
  5. Choose a public protocol/workflow (e.g., RNA-SEQ alignment)
  6. Add index and GTF files
  7. Go to Project -> task -> run

Reloop a circular plasmid from PacBio assembly

  1. Input file: Z9-cp26.fa (containing overlapping ends); b31-cp26.fa (reference)
  2. Find overlap ends by blasting against itself: blastn -task megablast -query Z9-cp26.fa -subject Z9-cp26.fa -evalue 1e-100 -outfmt 6
  3. The result says "1-7617" is the same as "26489-34118", so we use bioseq to remove the overlapping part: bioseq -s "1,26488" Z9-cp26.fa > Z9-cp26.fa2
  4. Run nucmer against b31-cp26 to identify the starting position: nucmer --coord ../b31-cp26.fa Z9-cp26.fa2
  5. The result says the first base of B31 corresponds to position 22171 of Z9-cp26, so we run bioseq to reloop: bioseq -R 22171 Z9-cp26.fa2 > Z9-cp26.fa3
  6. Run nucmer to confirm: nucmer --coord ../b31-cp26.fa Z9-cp26.fa3

KEGG REST API

# Author: Edgaras Bezrucenkovas
# Date: Nov 12, 2017
#KEGG API
#Obtain module pathways and orthologs from KEGG pa14 database

#Import module to fetch the data from KEGG database
import urllib.request
import re             #Regular expressions


#Get PA14 modules
kegg_db = "http://rest.kegg.jp/link/module/pau"
pa14_mdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_mods = []
for line in pa14_mdata:
    line = line.split("\t")
    mod = line[1].replace("md:pau_", "") # remove the prefix (strip() would drop a character set, not a prefix)
    pa14_mods.append(mod)

#Get PA14 genes
kegg_db = "http://rest.kegg.jp/link/pau/module"
pa14_gdata = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
pa14_genes = []
for line in pa14_gdata:
    gdata = line.split("\t")
    gdata[0] = gdata[0].replace("md:pau_", "") # module id
    gdata[1] = gdata[1].replace("pau:", "")    # gene id
    pa14_genes.append(gdata)
    
#Open the file containing KEGG compounds ids
id_file = open("kegg-entries.txt")
kegg_ids = id_file.readlines()
compound_ids = []
for cid in kegg_ids:
    cid = cid.strip("\n")
    compound_ids.append(cid)
id_file.close()

#Get module pathway ids
for cid in compound_ids:
    kegg_db = "http://rest.kegg.jp/link/module/compound:"+cid
    module_data = urllib.request.urlopen(kegg_db).read().decode('utf-8').splitlines()
    module_ids = []
    for line in module_data:
        if re.match("cpd", line):
            line = line.split("\t")
            m_id = line[1].replace("md:", "")
            module_ids.append(m_id)
        else:
            module_ids.append("NA")
            
    #Filter module pathway ids
    modules_final = []
    for m_id in module_ids:
        if m_id == "NA":
            modules_final.append("NA")
        elif m_id in pa14_mods:
            modules_final.append(m_id)
    if not modules_final:
        modules_final.append("NA")

    #Get genes
    genes = {}
    for m_id in modules_final:
        key = m_id
        genes.setdefault(key, [])
        if m_id == "NA":
            genes[key].append("NA")
        else:
            for i in pa14_genes:
                if m_id == i[0]:
                    genes[key].append(i[1])
                    
    #Get module names using module ids
    module_names = {}
    for m_id in modules_final:
        if m_id == "NA":
            m_name = {"NA":"NA"}
            module_names.update(m_name)
        else:
            kegg_db = "http://rest.kegg.jp/list/module:"+m_id
            module_data = urllib.request.urlopen(kegg_db).read().decode("utf-8").split("\t")
            module_name = module_data[1].strip("\n")
            m = {m_id:module_name}
            module_names.update(m)

    #Output
    for k, v in genes.items():
        for i in v:
            print(cid,k,i,module_names.get(k), sep = "\t")

D3js Tutorial

  1. Install brackets editor from adobe: http://brackets.io/
  2. Enable web server & create a directory "public_html" in your home directory
  3. Create the following file structure: test/index.html; test/js/index.js; test/css/index.css
  4. Basic D3: following this link
  5. Metabolomics: following this link

Transform a wide data frame to a long one

  • A function in R
df2long = function(x, varname, cname, rname){
  num.col = dim(x)[2] # number of value columns
  x$tmp = rownames(x) # stash row names in a temporary column
  y = reshape(x, varying=1:num.col, v.names=varname, timevar=cname, times=names(x)[1:num.col], direction="long")
  y=y[,-4] # drop the "id" column added by reshape()
  rownames(y)=NULL
  colnames(y)[1] = rname # first column holds the original row names
  y
}
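# usage (a hypothetical example; rows = genes, columns = samples):
# long.df <- df2long(wide.df, varname = "count", cname = "sample", rname = "gene")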
  • A function in Perl
#!/usr/bin/env perl
# expect a data frame with row and col headings
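# usage (script name hypothetical): ./df2long.pl wide.tsv > long.tsv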
use strict;
use warnings;
use Data::Dumper;

my $first = 0;
my @colnames;
my %data;
while(<>) {
    chomp;
    $first++;
    if ($first == 1) {
        @colnames = split;
        next;
    }

    my @data = split;
    my $rowname = shift @data;
    for (my $i=0; $i<=$#data; $i++) {
        $data{$rowname}->{$colnames[$i]} = $data[$i];
    }
}

#print Dumper(\%data);

foreach my $row (keys %data) {
    foreach my $col (@colnames) {
        print $row, "\t", $col, "\t", $data{$row}->{$col}, "\n";
    }
}

exit;

Variant call with samtools/bcftools & variant verification using IGV

  1. mpileup: samtools mpileup -uf ref.fa sorted1.bam sorted2.bam ... > gbs-50.mpileup
  2. call variants: bcftools call -mv gbs-50.mpileup > var.raw.vcf, OR: bcftools call -c -v --ploidy 1 gbs-50.mpileup > raw.vcf
  3. extract only SNP sites: vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all
  4. filter by quality: bcftools filter -i '%QUAL>200' gbs50-snps.recode.vcf > gbs50-snps2.vcf
  5. check by TsTv: bcftools stats gbs50-snps2.vcf | grep TSTV
  6. Extract SNPs into Fasta:
    1. with bcftools, list the samples: bcftools query -l gbs50-snps2.vcf > samples
    2. cat samples | while read line; do echo ">$line"; bcftools query -s "$line" -f '[%TGT]' gbs50-snps2.vcf; echo; done > samples.fas
    3. get ref seq: echo ">ref" >> samples.fas; { grep "^CP" gbs50-snps2.vcf | cut -f4 | tr -d '\n'; echo; } >> samples.fas
  7. Visualization with IGV
    1. Prepare & load reference genomes
    2. Load FASTA: bioseq -i'genbank' ref.gb > ref.fas
    3. Prepare GFF3 file: bp_genbank2gff3.pl --CDS --filter exon --filter gene /home/chongdi/michelle/pa_rp73.gb
  8. Prepare BAM files (see below on how to index and align reads using bwa and samtools, in "ospC Amplicon")
  9. Index sorted bam files with samtools & load BAM files (multiple bam files okay)

KRAKEN

  1. assigns taxonomic labels to short DNA reads by examining the k-mers within a read and querying a database with those k-mers: kraken -db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 --fastq-input 02015P1_S18_L001_R1_001.fastq.gz 02015P1_S18_L001_R2_001.fastq.gz --gzip-compressed --output outfile --paired
  2. summarize taxonomy: kraken-translate --db /belfer-ebox/projects/old_backup/qiulab/minikraken_20141208 outfile > predict.file
  3. count unique strains: cut -f2 s75.predict | cut -f10 -d';' | grep "^[a-zA-Z]" | sort | uniq -c | sort -rn
  4. Pick the strain with the highest counts as the reference genome

PATRIC microbial genome annotation CLI

  • Install Debian distribution from github: https://github.com/PATRIC3/PATRIC-distribution/releases/tag/1.015
  • Tutorial: http://tutorial.theseed.org/PATRIC-Tutorials/
  • Fetch all genomes from a genus: p3-all-genomes --in genome_name,Borrelia > Borrelia.genomes; p3-get-genome-data -i Borrelia.genomes -e genome_status,complete > Borrelia-complete.genomes
  • Retrieves features from coding sequences only: input file must be a list of genome ids with a header! p3-get-genome-features -i $file -e feature.feature_type,CDS -a pgfam_id -a patric_id -a plfam_id -a start -a end -a strand -a genome_name -a product -a accession > ${name}-all-features.txt
  • Retrieve feature sequence: p3-echo -t feature.accession "fig|100177.4.peg.1" | p3-get-feature-sequence --dna
  • Submit genomes for annotation
  1. Log in: p3-login user name/password
  2. Submit a genome for annotation:
    1. submit-patric-annotation --scientific-name "Pseudomonas aeruginosa" --taxonomy-id 287 --genetic-code 11 --domain Bacteria --log pa.log /weigang@patricbrc.org/home/wsdir gid_5.fas
    2. submit-patric-annotation --scientific-name "Streptococcus agalactiae" --taxonomy-id 1311 --genetic-code 11 --domain Bacteria --log sa-1.log /weigang@patricbrc.org/home/wsdir sa-1.fas
    3. Message: "Uploading gid_5.fas to workspace at /weigang@patricbrc.org/home/wsdir/gid_5/gid_5.fas"
  3. Create genome groups
    1. login
    2. cat trep.complete.ids | p3-put-genome-group "Treponema complete"
    3. p3-get-genome-group "Treponema complete"

Parsimony reconstruction using MPR

# MPR for each column
plot.mpr <- function(column=1) {
  plot.phylo(tr.unrooted, main = paste(colnames(ph.t)[column]))
  tmpr<-MPR(ph.t[,column], tr.unrooted, outgroup = "gid_1311.1320")
  nodelabels(paste("[", tmpr[, 1], ",", tmpr[, 2], "]", sep = "")) 
  tiplabels(ph.t[,column][tr.unrooted$tip.label], adj = -2)
}

# add internal node states
datalist <- data.frame(fam=character(), a=numeric(), b=numeric(), c=numeric())
for (i in 1:ncol(ph.t)) {
#for (i in 1:10) { # for testing on a subset of columns
  tmpr<-MPR(ph.t[,i], tr.unrooted, outgroup = "gid_1311.1320");
  datalist <- rbind(datalist, data.frame(fam=colnames(ph.t)[i], a=tmpr[1,1], b=tmpr[2,1], c=tmpr[3,1])) 
}
#plotting gains and losses
tr.unrooted <- unroot(tr)
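# ph.node.states is assumed to be a node-state matrix assembled from "datalist" above (columns 9:15 = node states)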
gains<-apply(ph.node.states[,9:15], 2, function(x) length(which(x>0)))
losses<-apply(ph.node.states[,9:15], 2, function(x) length(which(x<0)))
plot(tr.unrooted)
edgelabels(gains, adj = c(0.5,-0.25), col=2, frame= "none")
edgelabels(losses, adj = c(0.5,1.25), col=4, frame= "none")

ospC amplicon identification

#Index nucleotide file: 
bwa index ref.fa

#align:
bwa mem ref.fa sample_read1.fq  sample_read2.fq > sample.sam 

#BAM output: 
samtools view -b sample.sam > sample.bam

#Sort & index BAM file:
samtools sort -o sample.sorted.bam sample.bam
samtools index sample.sorted.bam

# Calculate coverage by bedtools (better, raw read coverages)
Based on the update: https://bioinformatics.stackexchange.com/questions/2318/how-to-count-reads-in-bam-per-bed-interval-with-bedtools
bedtools bamtobed -i sample.sorted.bam > sample.bed # bam to bed
bedtools coverage -b sample.bed -a ospC.bed  > sample.coverage

Alternatives (not good):
bedtools coverage -b sample.sorted.bam -a refs.bed -d > sample.bedtools.cov # per-site
bedtools coverage -b sample.sorted.bam -a refs.bed -counts > sample.bedtools.cov # average

# Generate VCF file (not needed for coverage)
samtools mpileup -uf ref.fa s18.sorted.bam s56.sorted.bam > test.mpileup 
bcftools call -c -v test.mpileup > test.raw.vcf

#R plot
# BASH script from Tony Fung & Jamie Cabigas (Spring 2019)

#!/usr/bin/env bash

# ----------------------------------------
# File            : fas-to-vcf.sh
# Author          : Tony Fung & Jamie Cabigas
# Date            : March 21, 2019
# Description     : Batch convert FAS files to VCF files
# Requirements    : samtools 1.9, bcftools 1.9, bwa 0.7.12
# Input           : reference file, format of sequence files, up to 7 sequence id's
# Sample Command  : bash fas-to-vcf.sh ref.fasta fastq ERR221622 ERR221661 ERR221662 ERR221663
# Output          : up to 7 VCF files
# ----------------------------------------

if [ $# -lt 3 ]
then
  echo "Usage: bash $0 <ref_file> <format_of_sequence_files> <sequence_id> [<sequence_id_2> <sequence_id_3> etc.]";
  exit;
fi

for arg
do

  if [ $arg == $1 ]
  then
    ref_file="../$1";
  elif [ $arg == $2 ]
  then
    file_format=$2;
  else
    seq_id=$arg;
    fas_file_1="${seq_id}_1.$file_format";
    fas_file_2="${seq_id}_2.$file_format";

    pwd;
    cd $seq_id;
    pwd;

    # Index nucleotide file:
    bwa index $ref_file;

    # Align:
    bwa mem $ref_file $fas_file_1 $fas_file_2 > $seq_id.sam

    # BAM output:
    samtools view -b $seq_id.sam > $seq_id.bam

    # Sort & index BAM file:
    samtools sort $seq_id.bam -o $seq_id.sorted.bam
    samtools index $seq_id.sorted.bam

    # Generate VCF file:
    # -u is deprecated, use bcftools mpileup instead
    # samtools mpileup -uf $ref_file $seq_id.sorted.bam > $seq_id.mpileup
    bcftools mpileup -f $ref_file $seq_id.sorted.bam > $seq_id.mpileup
    bcftools call -c -v $seq_id.mpileup > $seq_id.raw.vcf

    cd ..;
    pwd;
  fi

done

Running a Screen Session

use byobu

  1. start a screen session by typing "byobu"
  2. run commands
  3. Detach by pressing "F6"
  4. Reattach by typing "byobu"
  5. Terminate by typing "exit"

use screen

  • Start a screen session
screen
  • Detach the running mission
ctrl + A + D
  • Show the list of running process
screen -ls
  • Reattach a running process
screen -r ProcessID
  • Terminate a process
ctrl + A
:quit

Bp-utils: sequence, alignment & tree utilities by Qiu Lab

bioseq: sequence/FASTA manipulations

  • Use accession "CP002316.1" to retrieve the Genbank file from NCBI. Save the output (in genbank format) to a file named "cp002316.gb".
bioseq -f "CP002316.1" -o'genbank' > cp002316.gb
  • Use the above file as input, extract FASTA sequences for each gene, and save the output to a new file called "cp002316.fas". Use this file for the following questions.
bioseq -i "genbank" -F cp002316.gb > cp002316.fas
  • Count the number of sequences.
bioseq -n cp002316.fas
  • In a single command, pick the first 10 sequences and find their lengths.
bioseq -p "order:1-10" cp002316.fas | bioseq -l
  • In a single command, pick the third and seventh sequences from the file and do the 3-frame translation. Which reading frame is correct for both? Specify.
bioseq -p "order:3,7" cp002316.fas | bioseq -t3
  • Find the base composition of the last two sequences.
bioseq -p "order:25-26" cp002316.fas | bioseq -c
  • Pick the sequence with id "BbuJD1_B11|8784|9302|1" and count the number of codons present in this sequence.
bioseq -p "id:BbuJD1_B11|8784|9302|1" cp002316.fas | bioseq -C
  • Delete the last 10 sequences from the file and save the output to cp002316-v2.nuc.
bioseq -d "order:17-26" cp002316.fas > cp002316-v2.nuc
  • In a single command, pick the first sequence, then get nucleotides 50-110 and make the reverse complement of the sub-sequence.
bioseq -p "order:1" cp002316.fas | bioseq -s "50,110" | bioseq -r
  • In a single command, get the first 100 nucleotides of all the sequences present in the file and do 1-frame translation of all sub-sequences.
bioseq -s "1,100" cp002316.fas | bioseq -t1

bioaln: alignment/CLUSTALW manipulations

  • Go to /home/shared/LabMeetingReadings/Test-Data and find the sequence alignment file “bioaln_tutorial.aln”. Name the format of the alignment file. Use it to answer all the questions below.
CLUSTALW
  • Find the length of the alignment.
bioaln -l bioaln_tutorial.aln
  • Count the number of the sequences present in the alignment.
bioaln -n bioaln_tutorial.aln
  • How do you convert this alignment in phylip format? Save the output.
bioaln -o "phylip" bioaln_tutorial.aln > test.phy
  • Pick “seq2, seq5, seq7, seq10” from the alignment and calculate their average percent identity.
bioaln -p "seq2, seq5, seq7, seq10" bioaln_tutorial.aln | bioaln -a
  • Get an alignment slice from “50-140” and find the average identities of the slice for sliding windows of 25.
bioaln -s "50, 140" bioaln_tutorial.aln | bioaln -w "25"
  • Extract conserved blocks from the alignment.
bioaln -B bioaln_tutorial.aln
  • Find the unique sequences and list their ids.
bioaln -u bioaln_tutorial.aln | bioaln -L
  • Extract third sites from the alignment and show only variable sites in match view.
bioaln -T bioaln_tutorial.aln | bioaln -v | bioaln -m
  • Remove the gaps and show the final alignment in codon view for an alignment slice “1-100”.
 bioaln -s "1, 100" bioaln_tutorial.aln | bioaln -g | bioaln -c
  • Add a 90% consensus sequence and then show the final alignment in match plus codon view for an alignment slice “20-80”. (Hint: match view followed by codon view)
bioaln -s "20, 80" bioaln_tutorial.aln | bioaln -C "90" | bioaln -m | bioaln -c

biotree: tree/NEWICK manipulations

biopop: SNP statistics

Homology searching and clustering

BLAST+: search ("google") for homologs / pairwise alignment

hmmer

  • link: http://hmmer.org/
  • Installation: % conda install -c biocore hmmer # Anaconda
  • Build profile: hmmbuild globins4.hmm globins4.sto
  • Search: hmmsearch globins4.hmm uniprot_sprot.fasta > globins4.out

cdhit

cdhit -i all.pep -o all.cdhit -c 0.5 -n 3

Options:

  • -i: input file
  • -o: output file
  • -c: identity threshold (sequences below this identity are placed into different clusters/families)
  • -n: word length
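Note (per the cd-hit user guide; worth verifying for your version): the word length -n should match the -c threshold, roughly -n 5 for -c 0.7-1.0, -n 4 for -c 0.6-0.7, and -n 3 for -c 0.5-0.6 (hence -n 3 with -c 0.5 above).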

interproscan

../../software/interproscan/interproscan-5.13-52.0/interproscan.sh -i trep-cdhit.representatives.pep2 -o  trep-representatives.tsv -t p -goterms -pa -f tsv

Documentation page: How to run

Programs for producing multiple alignments

MUSCLE

MUGSY

  • MUGSY bash
#!/bin/sh

export MUGSY_INSTALL=/home/weigang/mugsy_x86-64-v1r2.2
export PATH=$PATH:$MUGSY_INSTALL:$MUGSY_INSTALL/mapping
export PERL5LIB=$MUGSY_INSTALL/perllibs
#For testing TBA
#export PATH=$PATH:$MUGSY_INSTALL/../../multiz-tba/trunk/
  • source the bash file
source mugsyenv.sh
  • run mugsy
mugsy --directory /home/chongdi/Streptococcus/mugsy-output -prefix mugsy_aln mugsy-input/*.fa

CLUSTALW

MAFFT

TCOFFEE

Programs for producing phylogeny & phylogenetic analysis

IQ-Tree

  • Beginner's tutorial
  • Advanced tutorial
  • Model search (slow!! better done with byobu)
    • iqtree -s example.phy -m MFP
  • protein tree with fast bootstrap:
    • iqtree -s example.phy -m WAG+I+G -bb 1000 (version 1.xx; -B for latest version)
  • site-specific rates:
    • iqtree -s example.phy -rate
    • iqtree -s example.phy -wsr (version 1.xx; -t and -m from above to save time)
  • ancestral state reconstruction:
    • iqtree -s example.phy -asr (-t and -m from above to save time)

FastTree

PHYLIP

MrBayes

RAxML

  • Required arguments
    • -s alignment (in PHYLIP or FASTA)
    • -n tag
  • Simple run (ML): raxmlHPC-SSE3 -x 12345 -p 12345 -# autoMRE -s concat.fas -m GTRGAMMA -n tag -q part.txt
  • Bootstrap: raxmlHPC-SSE3 -f a -x 12345 -p 12345 -# 100 -s concat.fas -m GTRCAT -n tag -q part.txt
  • Make a file named "part.txt" with the following lines (change the number to the total length of your alignment):
DNA, gene1codon1 = 1-3765906\3
DNA, gene1codon2 = 2-3765906\3
DNA, gene1codon3 = 3-3765906\3
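(The "\3" takes every third site starting from the given position, so the three lines partition the alignment into codon positions 1, 2, and 3.)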
  • The resulting files
    • RAxML_bestTree.tag: best tree (no bootstrap)
    • RAxML_bipartitionsBranchLabels.tag: ignore
    • RAxML_bipartitions.tag: main result. Feed this tree to figtree
    • RAxML_bootstrap.tag: ignore
    • RAxML_info.tag : log file
  • Protein models
    • raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG # protein, gamma, Whelan & Goldman (2001) model
    • raxmlHPC-SSE3 -s protein.phy -n A2 -m PROTGAMMAGTR # protein, gamma, user model
    • raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 # protein, gamma, Whelan & Goldman (2001) model, bootstrap
    • raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -o Carp,Loach # protein, gamma, Whelan & Goldman (2001) model, root on a (monophyletic group)
    • raxmlHPC-SSE3 -s protein.phy -n A1 -m PROTGAMMAWAG -# autoFC -p 0123 # protein, gamma, Whelan & Goldman (2001) model, bootstrap (at least 99 splits, auto-stopping)
    • raxmlHPC-SSE3 -f a -s protein.phy -n A1 -m PROTGAMMAWAG -# 100 -p 0123 -x 0123 # Rapid bootstrap with consensus
      • output 1, RAxML_bestTree.A1 (ML tree)
      • output 2, RAxML_bootstrap.A1 (bootstrap replicates)
      • output 3, RAxML_bipartitions.A1 (tree with bootstrap values)

PhyloNet

R packages for phylogenetics

APE

Convert a tree to ultrametric using time estimates

t <- read.tree("core-tree-30otus.dnd")

t.ultra <- chronopl(t, 0)
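# chronopl() dates the tree by penalized likelihood; the second argument is the smoothing parameter lambda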

t.hclust <- as.hclust(t.ultra)

t.dendro <- as.dendrogram(t.hclust)

heatmap(ge.var, Rowv=t.dendro) # order the rows with customized tree

phangorn

phytools

Population genetics

LDHat: test of recombination based on 4-gametes

  • filter sites: convert -seq snps.sites -loc snps.locs -freqcut 0.08
  • Pairwise LD tests: First generate "lookup" table: lkgen -lk /usr/local/LDhat/lk_files/lk_n50_t0.001 -nseq 48; then calculate pairwise LD statistics: pairwise -seq sites.txt -loc locs.txt -lk new_lk.txt
  • Run interval for recombination hot spots: interval -seq sites.txt -loc locs.txt -lk new_lk.txt -its 1000000 -bgen 5 -exact
  • Get stats: stat -input rates.txt -loc locs.txt -burn 50 (burnin=100K)
  • Plot in R:
x = read.table("rates.txt", skip=1, fill=T);
x = as.matrix(x);
burn.in = 50;
low = as.integer(nrow(x)*burn.in/100);
means<-apply(x[low:nrow(x),], 2, mean, na.rm=T);
q.95<-apply(x[low:nrow(x),], 2, quantile, probs=c(0.025, 0.5, 0.975), na.rm=T);
pos<-as.vector(as.matrix(read.table("locs.txt", as.is=T, skip=1))); 
plot(pos[1:(length(pos)-1)], y=means[2:length(means)], type="s", col=rgb(0,0,0.5),	
     xlab="SNP position (B111)", ylab="Posterior mean recombination rate", main="GBS Strep: recombination by LDhat");
lines(pos[1:(length(pos)-1)], y=q.95[1,2:length(means)], type="s", col=grey(0.75), lty="dotted");
lines(pos[1:(length(pos)-1)], y=q.95[3,2:length(means)], type="s", col=grey(0.75), lty="dotted");

ms: coalescence simulation

SFS: forward simulation

PAML: testing selection with Ka/Ks

Microbial genome databases & pipelines in Qiu Lab

borreliabase

pa2

spiro_genomes/treponema

Blast a set of genes against a bacteria genome

  1. download genome
  2. extract gene sequences & translate
  3. Make blast database
  4. Run blastp
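A minimal sketch of steps 3-4 (file names are placeholders; assumes protein sequences):

makeblastdb -in genome-proteins.faa -dbtype prot
blastp -query genes.faa -db genome-proteins.faa -evalue 1e-5 -outfmt 6 -out hits.tsv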

de novo variant call with cortex_var

Create binary file of fasta genome file.

Run cortex_var_31_c1 (the _c1 build is compiled for one colour, i.e., a single genome)

  • --se_list is the option that reads the list of input files you want to target (e.g., list-genome.txt)
  • --kmer_size is the k-mer size; it must be an odd integer
  • --mem_width: always choose 17
  • --mem_height: always choose 100
  • --dump_binary: name your output file (e.g., Genome.ctx)
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 \
--se_list list-Evo.txt \
--kmer_size 31 \
--mem_width 17 \
--mem_height 100 \
--dump_binary Evo.ctx \
> Evo.log # save the log file

Read each binary file (.ctx) into its own individual color list (ls Evo.ctx > Evo.colorlist). Then collect these lists into a single colorlist.txt (ls *.ctx > colorlist.txt).

Reveal genetic variation using the Bubble Caller from cortex_var.

/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5 \
--se_list colorlist.txt \
--kmer_size 31 \
--mem_width 17 \
--mem_height 100 \
--dump_binary all-colors.ctx \
> all-colors.log # save the log file

Bubble caller will detect differences between each genome by assigning distinct colors to each genome (note that the UK spelling of color is used: colour)

  • --multicolour_bin holds your all-colors.ctx binary from the Bubble Caller
  • --detect_bubbles1 i/j detects variation between genomes i and j, where i and j are the colour numbers of the genomes in colorlist.txt (colours are numbered from 0 in file order, so the fourth genome corresponds to colour 3)
  • --output_bubbles1: output variant reads in FASTA format (e.g., Evo-RefHG.var for bubble detection between the Evolved genome and the Reference HG genome)
  • --print_colour_coverages: necessary for output
/home/weigang/CORTEX_release_v1.0.5.21/bin/cortex_var_31_c5 \
--kmer_size 31 \
--mem_height 17 \
--mem_width 100 \
--multicolour_bin all-colors.ctx \
--detect_bubbles1 0/1 \
--output_bubbles1 Evo-RefHG.var \
--print_colour_coverages \
> Evo-RefHG.log # save the log file

Variant call with cortex_var

Example

  • Files
MS00018_S5_L001_R1_001.fastq
MS00018_S5_L001_R2_001.fastq
KU-090401_S3_L001_R1_001.fastq
KU-090401_S3_L001_R2_001.fastq
...
  • File content
@M03268:52:000000000-AJFAY:1:1101:16970:1555 1:N:0:7
CCCATGAACGGCACGTTCACGATGCAGAAGGTGGTGACCAGGCCGGTGTCGGCGACCTCGACGTAATCGTCGGTGGGGGA
GCCGTCGCGCGGGTCCGCGCCGCGCGGCGGGAAGTACACCTTGCCGTCCATGCGGGCGCGGGCGCCCAGCAGACGACCCT
CCATGAGCGCATTCAGGAATTCGGCCTCCTGCGGCGCGGCGGTGTGCTTGATATTGAAGTCGACCGGGGTCACGATGCCG
GTGACCGGTT
+
11>1>111@11>1A1FGF1FC0F0A111010GB0ECBFGF0/AFECGGFHECE??EEGHE/EEEHEFHEHHGCEECE??/
<CFCGGCCGCCCCCCHCGGCCCG@G??@@?@-@BFFFFFFFFFFF;B@FBFBFF<?@;@@-=?@??@-EFFFFF@;@@@F
FFBF/BBF;@@FFFFFFFFFFFFF@FFFFFFF<@@@@@@@@@@@FBFFFFFFFFFFFFFFFFF@@@?@?@FFFFFFFBF@
?@@EFF@@<B
@M03268:52:000000000-AJFAY:1:1101:16136:1618 1:N:0:7
TCCTGGCCCGTGAAACCGCTTGCCCGGTACAGGTTCTGGACTACCGCCTGGCACCCGAGCATCCGTTCCCGGCGGCGCTC
GACGACGCCGAGGCGGCGTTCCTGGAACTGGTGGCCGCCGGCTACCGGCCCGAACGCATTGCGGTCGCGGGTGATTCGGCCGGTGGCGGGCTCTCGCTCGCGCTGGCCGAACGGCTGCGCGACCGGCACGGGCTGGTTCCGGCCGCGCTCGGGCTGATCGCGCCCTGGGC
+
11>>11C1A@A?11BDF?EEGAFGFG?ECCHBHHGFG1EGHFHHHCCGGC/GCGGGCEECEFGHGGFFGHGGGGCECCC<CCC@CCCC@CGCCCGGC?:@EBFBF/CFFFF0/CFG?=@=-9>AFF@=@@?@;@@@F--9:BF@A@@?@--;9-BFFFF@A-@99B?-9@?=EFFBAAE;9>?;@@BF@9-@-;@=-E@@@=@@;@?>@9@<?@?BBBFFF;?>;;-@@?@<?@?9-FBFF@-99-9E-9
...

Step 1

  • Create matched FASTQ files (python script)
#!/usr/bin/env python

from sys import argv
script, File1, File2 = argv

# Index the reads of the first file by the shared part of the header;
# the last 9 characters (e.g., " 1:N:0:7") differ between the two mates
file1 = open(File1)
dict1 = {}
tails = {}
for line in file1:
    if '@M03268' in line:
        tag1 = line.rstrip()[:-9]
        tails[tag1] = line.rstrip()[-9:] # remember this read's own header tail
        dict1[tag1] = []
    else:
        dict1[tag1].append(line.rstrip())
file1.close()

# Create two output files
f1 = open(File1.replace('.fastq', '_mat.fq'), 'w')
f2 = open(File2.replace('.fastq', '_mat.fq'), 'w')

# Walk through the second file; write out only reads present in both files
file2 = open(File2)
keep = False
for line in file2:
    if '@M03268' in line: # a header line starts a new read
        tag2 = line.rstrip()
        keep = tag2[:-9] in dict1
        if keep: # mate found: write the file1 copy now
            tag1 = tag2[:-9]
            f1.write(tag1 + tails[tag1] + '\n')
            for j in range(3):
                f1.write(dict1[tag1][j] + '\n')
            del dict1[tag1]
            lines2 = [] # collect the three remaining lines of the file2 read
    elif keep:
        lines2.append(line.rstrip())
        if len(lines2) == 3:
            f2.write(tag2 + '\n')
            for j in range(3):
                f2.write(lines2[j] + '\n')
file2.close()

f1.close()
f2.close()

Step 2. Clean Fastq Files & Run Single-color Graph & Error Cleaning

  • Quality-trim the paired FASTQ files with bbduk:
bbduk.sh -Xmx1g in1=fastq_file1 in2=fastq_file2 -out1=clean1.fq -out2=clean2.fq qtrim=rl trimq=20
  • Create a file list showing all outcome files whose extensions need to be changed from _mat.fq to .list
for f in *_mat.fq; 
do 
  title=$(echo $f | cut -d'_' -f2); 
  id=$(echo $f | cut -d'_' -f1); 
  echo $f > ${id}.list${title}; 
done
  • Single color graph for sample
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 \
--pe_list MS00018_S5.list1,MS00018_S5.list2 \
--kmer_size 31 \
--mem_height 17 \
--mem_width 100 \
--dump_binary MS00018_S5.ctx \
--sample_id MS00018_S5 \
--remove_pcr_duplicates \
--quality_score_threshold 20 > MS00018_S5.log
  • Single color graph for reference
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 \
--se_list ref.filelist \
--kmer_size 31 \
--mem_height 17 \
--mem_width 100 \
--dump_binary ref.ctx \
--sample_id ref > ref.log
  • Run Error Cleaning for All Samples (reference is not included)
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c1 \
--mem_height 18 \
--mem_width 100 \
--kmer_size 31 \
--multicolour_bin N18_S15.ctx \
--remove_low_coverage_supernodes 10 \
--dump_binary N18_S15.cleaned.ctx

Step 3

  • Pull the name of each .cleaned.ctx file to a cleaned.list file, then create a .filelist file for all cleaned.list files.
ls file1.cleaned.ctx > file1.cleaned.list
ls file2.cleaned.ctx > file2.cleaned.list
ls ref.ctx > ref.list
ls -1 *.list > ref-sample.filelist
  • Multicolour Graph
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3 \
--mem_height 20 \
--mem_width 100 \
--kmer_size 31 \
--colour_list ref-sample.filelist \
--dump_binary ref-sample.ctx > ref-sample.log

Step 4

  • Variation Discovery Using The Bubble Caller
../../CORTEX_release_v1.0.5.21/bin/cortex_var_31_c3 \
--mem_height 20 \
--mem_width 100 \
--kmer_size 31 \
--multicolour_bin ref-sample.ctx \
--detect_bubbles1 -1/-1 \
--ref_colour 2 \
--output_bubbles1 bubbles-in-sample.out \
--print_colour_coverages \
--experiment_type EachColourAHaploidSampleExceptTheRefColour \
--genome_size 8000000 > bubbles-in-sample.log

Step 5

  • Reference Genome

(On the cluster, "module load stampy" doesn't work due to a path problem; run the following on wallace:)

stampy.py -G ref ref.fa
stampy.py -g ref -H ref
  • Turn Into VCF with reference

Make a sample list file (from bubble or multicolor log file):

cat e1-bubbles-in-sample.log | grep CLEANED | cut -f2 > e1.sample.list

Customize the following command based on your output files, num of colors, index of ref colors, etc

perl /home/weigang/CORTEX_release_v1.0.5.21/scripts/analyse_variants/process_calls-wq.pl --callfile e1-bubbles-in-sample.out --callfile_log e1-bubbles-in-sample.log -outvcf e1-bubbles-in-sample --outdir e1-vcfout --samplename_list e1.sample.list --num_cols 7 --stampy_bin /home/weigang/stampy-1.0.28/stampy.py --stampy_hash ref --refcol 6 --vcftools_dir /usr/local/bin --caller BC --kmer 31 --ploidy 1

Step 6. Parse VCF files

  1. Filter out low-quality sites:

vcftools --vcf pat-5.decomp.vcf --keep-filtered PASS --recode --out pat-5

  2. Extract coverage

vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info COV (output file: out.COV.FORMAT)

  3. Extract genotypes

vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT (output file: out.GT.FORMAT)

  4. Extract confidence

vcftools --vcf pat-5.recode.vcf --extract-FORMAT-info GT_CONF (output file: out.GT_CONF.FORMAT)

  5. Send the GT_CONF file to R and visualize the log10(conf) distribution with a boxplot
  6. Use a custom Perl script to filter out low-quality (e.g., GT_CONF < 30) genotype calls (flag them with "?" or "na"), and to make haplotype GTs ("1/1" to "1", "0/0" to "0", "./." to "?"); a sketch of the recoding step follows this list
  7. Verify variants using IGV (see IGV protocol above)
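A minimal awk sketch of the genotype recoding in step 6 (assumes the out.GT.FORMAT layout of CHROM and POS followed by one genotype column per sample; the GT_CONF filtering is left out; the output name is a placeholder):

awk 'NR == 1 { print; next }
     { for (i = 3; i <= NF; i++) {
         if ($i == "1/1") $i = "1"; else if ($i == "0/0") $i = "0"; else if ($i == "./.") $i = "?"
       }
       print }' OFS='\t' out.GT.FORMAT > out.GT.haploid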

Step 7. Variant Annotation & Visualization

  1. vcf_parser.py out.decomp.vcf ref.gb (the code needs validation)
  2. Data to database & web visualization, if necessary

hmmer

Annotate proteins with TIGRFAM

# --tblout: table output for all sequences; --domtblout: table output for all domains
# -E 0.01: level of sequence significance; --domE 0.01: level of domain significance
# -o /dev/null: don't show STDOUT
hmmsearch --tblout foo.hmmout --domtblout foo.dmout -E 0.01 --domE 0.01 -o /dev/null \
          ../../TIGRFAMs-Release-15-Sep-17-2014/TIGRFAMs_15.0_HMM.LIB \
          GCA_000583715.2_ASM58371v2_protein.faa & # HMM profile library for TIGRFAMs; input/query file in FASTA

PopGenome

library(PopGenome)
g = readVCF("pvt1.recode.vcf.gz", 1000, "8", 127890000, 128102000)
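# "sample" is assumed to be a data frame loaded earlier (column 1 = sample id, column 2 = population);
# "pop.group" (used for line colors below) is likewise assumed to be a population-to-color lookup table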
pops = split(sample[,1], sample[,2]) # create a list of populations
g = set.populations(g, pops, diploid = T) # set population names

# by windows
slide = sliding.window.transform(g, width = 100, jump = 20) # nsnps, not actual length
slide = F_ST.stats(slide, mode = "nucleotide")
snp.pos = slide@region.data@biallelic.sites # SNP positions
win.num = length(slide@region.names)
win.start = numeric()
for (i in 1:win.num) {win.start[i] = snp.pos[[i]][1]}
fst = slide@nuc.F_ST.vs.all
pop.names = names(slide@populations) # population names
plot(win.start, fst[,1], type ="n", las = 1, ylab = expression(F[st]), xlab = "SNP Position", ylim = c(0,0.4))
for (i in 1:length(slide@populations)) {
  lines(win.start, fst[,i], type = "l", col = pop.group[pop.names[i],4])
}
arch.coords=c(127982050, 127992931)
abline(v = arch.coords, col = "orange")
#rect(xleft = arch.coords[1], ybottom = -1, xright = arch.coords[2], ytop = 0.5, border = "transparent", col = 2)

Velvet

Cleaning

# in1/in2: input read files (.gz works as well); qtrim=rl trimq=20: quality-trim both ends at Q20
../../bbmap/bbduk.sh -Xmx1g \
           in1=WGC067462_hunhewenku_509_combined_R1.fastq \
           in2=WGC067462_hunhewenku_509_combined_R2.fastq \
           -out1=clean1.fq \
           -out2=clean2.fq \
           qtrim=rl \
           trimq=20

Running Velvet Optimizer

# --s/--e/--x: start/end/step of the k-mer size range; --optFuncKmer: optimization function
srun ../../VelvetOptimiser-2.2.5/VelvetOptimiser.pl \
          --t 32 \
          --s 31 --e 31 --x 6 \
          -f '-shortPaired -fastq clean1.fq -shortPaired2 -fastq clean2.fq' \
          -t 4 \
          --optFuncKmer 'n50' \
          -p prefix

PacBio assembly with canu

./canu -p staph-auto-5 -d staph-auto-5 genomeSize=2.2m -pacbio-raw pac-reads-5.tar.gz