Monday, May 23, 2005

Ruby script to find duplicate sequences of characters in STDIN

# Counts every window of +chunk_size+ consecutive characters in +text+ and
# returns the windows that occur more than once.
#
# text       - String to scan (the script below strips all whitespace first).
# chunk_size - Integer window length; must be positive.
#
# Returns an Array of [chunk, count] pairs ordered by ascending count, with
# ties broken by the chunk text so the order is deterministic.
# Raises ArgumentError if chunk_size is not positive.
def duplicate_chunks(text, chunk_size)
  raise ArgumentError, "chunk_size must be positive" unless chunk_size > 0

  counts = Hash.new(0)
  # text[i, chunk_size] takes exactly chunk_size characters. An inclusive
  # range (text[i..i + chunk_size]) would take chunk_size + 1 and leave the
  # final window one character shorter than all the others.
  # When text is shorter than chunk_size the upper bound is negative and the
  # loop simply does not run.
  0.upto(text.size - chunk_size) { |i| counts[text[i, chunk_size]] += 1 }

  # Drop singletons before sorting; only repeated chunks are reported.
  repeated = counts.select { |_chunk, count| count > 1 }
  repeated.sort_by { |chunk, count| [count, chunk] }
end

chunk_size = 40
# Whitespace is removed so that re-indented or re-wrapped copies still match.
s = STDIN.read.gsub(/\s/, "")
duplicate_chunks(s, chunk_size).each { |chunk, count|
  puts "#{count}\t#{chunk}"
}

1 Comment:

At 5/23/2005 9:27 p.m., Blogger Jonathan said...

find . -iname '*.rhtml' | xargs cat | ruby c:\junk\rhtml.rb | grep --invert-match stylesheet

 

Post a Comment

<< Home