# coding: utf-8
# Data scraper for https://gerrit.wikimedia.org/r/#/c/49776/
require 'restclient'
require 'nokogiri'
require 'unicode_utils'
require 'pp'
# Download the tailorings page and parse the response into a Nokogiri HTML document.
n = Nokogiri::HTML(RestClient.get('http://developer.mimer.com/charts/tailorings.htm'))
# Build one [langcode, rules, tailored_first_letters] triple per table row.
# The first three rows are skipped (presumably header rows -- confirm against the page).
data = n.css('table tr').drop(3).map do |row|
  # The language code is the run of lowercase letters/hyphens that follows "("
  # in the last child node of the matched first-cell element.
  langcode = row.at('td:first-child .language, td:first-child').children.last.text[/\(([a-z-]+)/, 1]
  $stderr.puts langcode # progress trace

  # The rules are the text of the first <b> inside the last cell, when present.
  bold = row.at('td:last-child').at('b')
  # NOTE(review): this call used to be gsub('<', '<'), which replaces a string
  # with itself. Matching a literal "&lt;" is assumed to be the intent, so that
  # an escaped less-than sign is not torn apart by the split on "&" below --
  # confirm against the live page.
  # Non-breaking spaces become plain spaces so that the /\s+/ split sees them.
  rules = bold ? bold.text.gsub('&lt;', '<').gsub("\u00A0", ' ') : ''

  # Each "&"-separated chunk is tokenised on whitespace and scanned as
  # overlapping token triples; whenever the middle token is exactly "<"
  # (not "<<", "<<<", ...), the token after it is collected.
  tailored_first_letters = rules.split('&').flat_map do |chunk|
    chunk.strip.split(/\s+/).each_cons(3).map do |_prev, mode, letter|
      next unless mode == '<'
      letter = 'İ' if letter == 'ı' # fix presence of dotted/dotless i for Turkish and Azerbaijani
      next if letter.ascii_only? && letter.bytesize == 1 # skip trivial cases
      UnicodeUtils.upcase(letter, langcode.to_sym)
    end.compact
  end

  [langcode, rules, tailored_first_letters]
end
# Order the triples by their first element (the language code).
data = data.sort_by(&:first)
# Print one PHP array entry per language on stdout, in the form
#   'xx' => array( "A", "B" ),
# A language with no collected letters is printed as `'xx' => array(),`.
# (The earlier `.sub('( )', '()')` could never match: an empty list produced
# two spaces between the parentheses, not one.)
puts data.map { |langcode, _rules, letters|
  if letters.empty?
    "'#{langcode}' => array(),"
  else
    # Backslash-escape the characters that are special inside a PHP
    # double-quoted string, so the emitted literal is always well-formed.
    quoted = letters.map { |lt| '"' + lt.gsub(/[\\"$]/) { |c| "\\#{c}" } + '"' }
    "'#{langcode}' => array( #{quoted.join(', ')} ),"
  end
}
# Write a plain "langcode: letter letter ..." line per language to stderr.
summary_lines = data.map do |langcode, _rules, letters|
  "#{langcode}: #{letters.join(' ')}"
end
$stderr.puts summary_lines