素振り

順序のある集合を分散させる素振り。

require 'rbtree'
require 'enumerator'
require 'drb'

# Front object that wraps an ordered tree and adds range iteration in
# fixed-size slices, so a remote caller receives several entries per yield.
#
# The wrapped tree must respond to #empty?, #first, #last and #bound.
class RBTreeFront
  # tree - the ordered collection to wrap.
  def initialize(tree)
    @tree = tree
  end

  # Returns an enumerator over the tree's #bound(lower, upper) iteration.
  #
  # lower - lower key of the range; the tree's first key is used when nil.
  # upper - upper key of the range; the tree's last key is used when nil.
  #
  # An empty tree produces an empty enumerator. Without this guard, the
  # default-bound lookup would call [0] on the nil that #first / #last
  # return for an empty tree and raise NoMethodError.
  def bound_for(lower, upper)
    return [].each if @tree.empty?

    lower ||= @tree.first[0]
    upper ||= @tree.last[0]
    @tree.enum_for(:bound, lower, upper)
  end

  # Walks the range [lower, upper] and hands the entries to the block in
  # arrays of at most n elements (same contract as Enumerable#each_slice).
  #
  # n     - maximum number of entries per slice.
  # lower - optional lower key (defaults as in #bound_for).
  # upper - optional upper key (defaults as in #bound_for).
  def bound_slice(n, lower=nil, upper=nil, &blk)
    bound_for(lower, upper).each_slice(n, &blk)
  end
end

# Registers one occurrence of word in tree under the key
# [word, path, lineno, pos], with true as the value.
#
# Before inserting, the tree is probed with lower_bound([word]). If the entry
# found there already carries an equal word, that existing String object is
# reused in the new key, so repeated words share one String instead of
# keeping a copy per occurrence.
#
# Returns true (the value stored).
def add_key(tree, word, path, lineno, pos)
  existing_key, = tree.lower_bound([word])
  shared = (existing_key && existing_key[0] == word) ? existing_key[0] : word
  tree[[shared, path, lineno, pos]] = true
end

# Build the index: one key per word occurrence in the input read through ARGF.
tree = RBTree.new
while (line = ARGF.gets)
  # The pattern has one capture group, so scan yields one-element arrays;
  # the block parameter unpacks the word directly.
  line.scan(/(\w+)/) do |(word)|
    # Offset of the current match within the line.
    offset = Regexp.last_match.begin(0)
    add_key(tree, word, ARGF.path, ARGF.file.lineno, offset)
  end
end

# Publish the wrapped tree as the dRuby front object, print the URI clients
# should connect to, and keep the process alive serving requests.
front = RBTreeFront.new(tree)
DRb.start_service(nil, front)
puts DRb.uri
DRb.thread.join

add_keyでへんなことやってるのは、wordに渡されるStringと同じ文字列がすでにあったら、そのオブジェクトをシェアするための細工。メモリを気にしてやってるんだけど、貧乏臭いね。

frontはRBTreeにbound_sliceを追加するために置いたのだけど、RBTreeに直接メソッドを追加した方がいいのかもしれない。bound_sliceはeach_sliceのように、n個ずつまとめてyieldするもの。これがあるとdRubyの通信回数を減らすことができるので、大量のデータをブラウズするような(たとえばMapReduceごっことか)ケースに使えるようになる、はず。

require 'drb'

# Command line: <server-uri> [slice-size]; the slice size falls back to 10.
uri, slice_arg = ARGV.shift(2)
num = (slice_arg || 10).to_i

DRb.start_service
ro = DRbObject.new_with_uri(uri)

# Request every key from ['def'] up to ['def' + "\0"], num entries per
# round trip, and print each slice as it arrives.
lower = ['def']
upper = ['def' + "\0"]
ro.bound_slice(num, lower, upper) { |ary| p ary }

@m_seki の

I like ruby tooから引っ越し