diff --git a/src/bindings/ruby/lib/hammer/internal.rb b/src/bindings/ruby/lib/hammer/internal.rb
index 0c462fe2d869ddf05e9fbd5700ce45b650644c6a..12d797f5a8007a1cfd43e3ca1b7848c7bc5a6baa 100644
--- a/src/bindings/ruby/lib/hammer/internal.rb
+++ b/src/bindings/ruby/lib/hammer/internal.rb
@@ -39,8 +39,12 @@ module Hammer
               :len, :size_t
 
-      def token
-        # TODO: Encoding? Should probably be the same encoding as the string the token was created with.
-        return self[:token].read_string(self[:len]) #.force_encoding('UTF-8')
+      # Returns the token's bytes as a String tagged with +encoding+.
+      #
+      # The encoding of the string the token was created with is not known at
+      # this point, so the caller supplies it; it defaults to UTF-8.
+      # TODO: Recover the original encoding automatically instead of asking the caller.
+      def token(encoding = 'UTF-8')
+        self[:token].read_string(self[:len]).force_encoding(encoding)
       end
     end
 
diff --git a/src/bindings/ruby/test/parser_test.rb b/src/bindings/ruby/test/parser_test.rb
index f5f12f2dcdb84962ce9434847b8dab1950617e16..abbd1c1e460acab22975abfabfd4f33311789a25 100644
--- a/src/bindings/ruby/test/parser_test.rb
+++ b/src/bindings/ruby/test/parser_test.rb
@@ -80,4 +80,17 @@ class ParserTest < Minitest::Test
 
     refute_nil parser.parse('今日a')
   end
+
+  # Round-trips a non-ASCII token through the parser and checks that the
+  # token read back equals the input when tagged with the same encoding.
+  def test_token_encoding(encoding = 'UTF-8')
+    string = '今日'.encode(encoding)
+    parser = Hammer::Parser.token(string)
+    assert_equal string, parser.parse(string)[:ast][:data][:bytes].token(encoding)
+  end
+
+  # Same round trip with a non-UTF-8 encoding.
+  def test_token_encoding_2
+    test_token_encoding('EUC-JP')
+  end
 end