Dengue & H1N1 Evolutionary Tree Reconstruction – Class Assignment

I used 2 Ruby scripts to create the trees by comparing genomes. H1N1 dataset was aligned, Dengue was unaligned. So I used poisson correction like thing on hamming distance for aligned sequence and used the euclidean distance between k-mer frequency vectors for nearly additive distance metric based hierarchical clustering.

The following is the aligned tree construction script:

#!/usr/bin/env ruby

require 'rubygems'
require 'bio'
require 'fastercsv'

def poisson l1, l2
a1 = l1.split(//)
a2 = l2.split(//)
sum = 0
a1.each_index{ |i|
if a1[i] != "N" && a2[i] != "N" && a1[i] != a2[i]
sum += 1
end
}
-Math.log(1- (sum.to_f/a1.size.to_f))
end

file = Bio::FastaFormat.open(ARGV.shift)
seqs = Hash.new(0)
file.each do |entry|
seqs[entry.entry_id] = entry.to_seq
end

M = Hash.new(0)
seqs.each_key{|i|
seqs.each_key{|j|
M[i] = Hash.new(0) if !M.has_key?(i)
M[j] = Hash.new(0) if !M.has_key?(j)
M[i][j] = M[j][i] = poisson(seqs[i], seqs[j])
}
}

# Neighbour Joining Algorithm
Z = []
seqs.each_key { |key|
tree = Bio::Tree.new
tree.root = Bio::Tree::Node.new(key)
Z << tree
}

dist = Hash.new(0)
Z.each { |i|
Z.each { |j|
dist[i] = Hash.new(0) if !dist.has_key?(i)
dist[j] = Hash.new(0) if !dist.has_key?(j)
dist[j][i] = dist[i][j] = M[i.root.name][j.root.name]
}
}

n = seqs.size
(2..n).each { |i|
u = Hash.new(0)
Z.each { |a|
sum = 0
Z.each { |d|
sum += dist[d][a]
}
u[a] = (1/(n-2)) * sum
}
a_min = nil
b_min = nil
score_min = 2**30 - 1
Z.each { |a|
Z.each { |b|
if a != b
score = dist[a][b] - u[a] - u[b]
if(score < score_min)
a_min = a
b_min = b
score_min = score
end
end
}
}
a = a_min
b = b_min
r_a = dist[a][b]/2 + (u[a] - u[b])/2
r_b = dist[a][b]/2 + (u[a] - u[b])/2
Z.delete(a)
Z.delete(b)
c = Bio::Tree.new
c.root = Bio::Tree::Node.new
c.concat(a)
c.add_edge(c.root,a.root,Bio::Tree::Edge.new(r_a))
c.concat(b)
c.add_edge(c.root,b.root,Bio::Tree::Edge.new(r_b))
Z.each { |d|
dist[d] = Hash.new(0) if !dist.has_key?(d)
dist[c] = Hash.new(0) if !dist.has_key?(c)
dist[d][c] = dist[c][d] = (dist[a][d] + dist[b][d] - dist[a][b])/2
}
Z << c
}

puts Z[0].newick

The following is the unaligned tree construction script. It uses a different distance metric. The hierarchical clustering is same as above.

#!/usr/bin/env ruby

require 'rubygems'
require 'bio'
require 'fastercsv'

def euclidean l1, l2
sum = 0
l = nil
if l1.size >= l2.size
l = l1
else
l = l2
end
l.each_key{ |i|
sum += ((l1[i] - l2[i]) ** 2)
}
Math.sqrt(sum)
end

file = Bio::FastaFormat.open(ARGV.shift)
_n_mers = Hash.new(0)
file.each do |entry|
_n_mer = Hash.new(0)
seq = entry.to_seq
seq.window_search(5,1) do |s|
_n_mer[s] += 1
end
_n_mers[entry.entry_id] = _n_mer
end

M = Hash.new(0)
_n_mers.each_key{|i|
_n_mers.each_key{|j|
M[i] = Hash.new(0) if !M.has_key?(i)
M[j] = Hash.new(0) if !M.has_key?(j)
M[i][j] = M[j][i] = euclidean(_n_mers[i], _n_mers[j])
}
}

# Neighbour Joining Algorithm
Z = []
_n_mers.each_key { |key|
tree = Bio::Tree.new
tree.root = Bio::Tree::Node.new(key)
Z << tree
}

dist = Hash.new(0)
Z.each { |i|
Z.each { |j|
dist[i] = Hash.new(0) if !dist.has_key?(i)
dist[j] = Hash.new(0) if !dist.has_key?(j)
dist[j][i] = dist[i][j] = M[i.root.name][j.root.name]
}
}

n = _n_mers.size
(2..n).each { |i|
u = Hash.new(0)
Z.each { |a|
sum = 0
Z.each { |d|
sum += dist[d][a]
}
u[a] = (1/(n-2)) * sum
}
a_min = nil
b_min = nil
score_min = 2**30 - 1
Z.each { |a|
Z.each { |b|
if a != b
score = dist[a][b] - u[a] - u[b]
if(score < score_min)
a_min = a
b_min = b
score_min = score
end
end
}
}
a = a_min
b = b_min
r_a = dist[a][b]/2 + (u[a] - u[b])/2
r_b = dist[a][b]/2 + (u[a] - u[b])/2
Z.delete(a)
Z.delete(b)
c = Bio::Tree.new
c.root = Bio::Tree::Node.new
c.concat(a)
c.add_edge(c.root,a.root,Bio::Tree::Edge.new(r_a))
c.concat(b)
c.add_edge(c.root,b.root,Bio::Tree::Edge.new(r_b))
Z.each { |d|
dist[d] = Hash.new(0) if !dist.has_key?(d)
dist[c] = Hash.new(0) if !dist.has_key?(c)
dist[d][c] = dist[c][d] = (dist[a][d] + dist[b][d] - dist[a][b])/2
}
Z << c
}

annot_file = ARGV.shift
if !annot_file.nil?
annots = FasterCSV.read(annot_file)
Z[0].leaves.each{ |leaf|
annots.each{ |annot|
if annot[0].include?(leaf.name)
leaf.name = annot.to_csv
end
}
}
end

puts Z[0].newick

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s