# Dengue & H1N1 Evolutionary Tree Reconstruction – Class Assignment

I used 2 Ruby scripts to create the trees by comparing genomes. H1N1 dataset was aligned, Dengue was unaligned. So I used poisson correction like thing on hamming distance for aligned sequence and used the euclidean distance between $k$-mer frequency vectors for nearly additive distance metric based hierarchical clustering.

The following is the aligned tree construction script:
``` #!/usr/bin/env ruby```

```require 'rubygems' require 'bio' require 'fastercsv'```

```def poisson l1, l2 a1 = l1.split(//) a2 = l2.split(//) sum = 0 a1.each_index{ |i| if a1[i] != "N" && a2[i] != "N" && a1[i] != a2[i] sum += 1 end } -Math.log(1- (sum.to_f/a1.size.to_f)) end```

```file = Bio::FastaFormat.open(ARGV.shift) seqs = Hash.new(0) file.each do |entry| seqs[entry.entry_id] = entry.to_seq end```

``` M = Hash.new(0) seqs.each_key{|i| seqs.each_key{|j| M[i] = Hash.new(0) if !M.has_key?(i) M[j] = Hash.new(0) if !M.has_key?(j) M[i][j] = M[j][i] = poisson(seqs[i], seqs[j]) } } # Neighbour Joining Algorithm Z = [] seqs.each_key { |key| tree = Bio::Tree.new tree.root = Bio::Tree::Node.new(key) Z << tree } dist = Hash.new(0) Z.each { |i| Z.each { |j| dist[i] = Hash.new(0) if !dist.has_key?(i) dist[j] = Hash.new(0) if !dist.has_key?(j) dist[j][i] = dist[i][j] = M[i.root.name][j.root.name] } } n = seqs.size (2..n).each { |i| u = Hash.new(0) Z.each { |a| sum = 0 Z.each { |d| sum += dist[d][a] } u[a] = (1/(n-2)) * sum } a_min = nil b_min = nil score_min = 2**30 - 1 Z.each { |a| Z.each { |b| if a != b score = dist[a][b] - u[a] - u[b] if(score < score_min) a_min = a b_min = b score_min = score end end } } a = a_min b = b_min r_a = dist[a][b]/2 + (u[a] - u[b])/2 r_b = dist[a][b]/2 + (u[a] - u[b])/2 Z.delete(a) Z.delete(b) c = Bio::Tree.new c.root = Bio::Tree::Node.new c.concat(a) c.add_edge(c.root,a.root,Bio::Tree::Edge.new(r_a)) c.concat(b) c.add_edge(c.root,b.root,Bio::Tree::Edge.new(r_b)) Z.each { |d| dist[d] = Hash.new(0) if !dist.has_key?(d) dist[c] = Hash.new(0) if !dist.has_key?(c) dist[d][c] = dist[c][d] = (dist[a][d] + dist[b][d] - dist[a][b])/2 } Z << c } ```

```puts Z[0].newick ```
The following is the unaligned tree construction script. It uses a different distance metric. The hierarchical clustering is same as above.
``` #!/usr/bin/env ruby```

```require 'rubygems' require 'bio' require 'fastercsv'```

```def euclidean l1, l2 sum = 0 l = nil if l1.size >= l2.size l = l1 else l = l2 end l.each_key{ |i| sum += ((l1[i] - l2[i]) ** 2) } Math.sqrt(sum) end```

```file = Bio::FastaFormat.open(ARGV.shift) _n_mers = Hash.new(0) file.each do |entry| _n_mer = Hash.new(0) seq = entry.to_seq seq.window_search(5,1) do |s| _n_mer[s] += 1 end _n_mers[entry.entry_id] = _n_mer end```

``` M = Hash.new(0) _n_mers.each_key{|i| _n_mers.each_key{|j| M[i] = Hash.new(0) if !M.has_key?(i) M[j] = Hash.new(0) if !M.has_key?(j) M[i][j] = M[j][i] = euclidean(_n_mers[i], _n_mers[j]) } } # Neighbour Joining Algorithm Z = [] _n_mers.each_key { |key| tree = Bio::Tree.new tree.root = Bio::Tree::Node.new(key) Z << tree } dist = Hash.new(0) Z.each { |i| Z.each { |j| dist[i] = Hash.new(0) if !dist.has_key?(i) dist[j] = Hash.new(0) if !dist.has_key?(j) dist[j][i] = dist[i][j] = M[i.root.name][j.root.name] } } n = _n_mers.size (2..n).each { |i| u = Hash.new(0) Z.each { |a| sum = 0 Z.each { |d| sum += dist[d][a] } u[a] = (1/(n-2)) * sum } a_min = nil b_min = nil score_min = 2**30 - 1 Z.each { |a| Z.each { |b| if a != b score = dist[a][b] - u[a] - u[b] if(score < score_min) a_min = a b_min = b score_min = score end end } } a = a_min b = b_min r_a = dist[a][b]/2 + (u[a] - u[b])/2 r_b = dist[a][b]/2 + (u[a] - u[b])/2 Z.delete(a) Z.delete(b) c = Bio::Tree.new c.root = Bio::Tree::Node.new c.concat(a) c.add_edge(c.root,a.root,Bio::Tree::Edge.new(r_a)) c.concat(b) c.add_edge(c.root,b.root,Bio::Tree::Edge.new(r_b)) Z.each { |d| dist[d] = Hash.new(0) if !dist.has_key?(d) dist[c] = Hash.new(0) if !dist.has_key?(c) dist[d][c] = dist[c][d] = (dist[a][d] + dist[b][d] - dist[a][b])/2 } Z << c } annot_file = ARGV.shift if !annot_file.nil? annots = FasterCSV.read(annot_file) Z[0].leaves.each{ |leaf| annots.each{ |annot| if annot[0].include?(leaf.name) leaf.name = annot.to_csv end } } end ```

`puts Z[0].newick`