Skip to content

Commit caf40b6

Browse files
jarredholman and pythonicrubyist
authored and committed
Unescape the special hex code escape sequences (#73)
Unescape the special hex code escape sequences (#73)
1 parent 3f1f724 commit caf40b6

File tree

4 files changed

+19
-5
lines changed

4 files changed

+19
-5
lines changed

lib/creek/shared_strings.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ def self.parse_shared_string_from_document(xml)
3131
xml.css('si').each_with_index do |si, idx|
3232
text_nodes = si.css('t')
3333
if text_nodes.count == 1 # plain text node
34-
dictionary[idx] = text_nodes.first.content
34+
dictionary[idx] = Creek::Styles::Converter.unescape_string(text_nodes.first.content)
3535
else # rich text nodes with text fragments
36-
dictionary[idx] = text_nodes.map(&:content).join('')
36+
dictionary[idx] = text_nodes.map { |n| Creek::Styles::Converter.unescape_string(n.content) }.join('')
3737
end
3838
end
3939

lib/creek/styles/converter.rb

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ module Creek
44
class Styles
55
class Converter
66
include Creek::Styles::Constants
7+
8+
# Excel non-printable character escape sequence
9+
# Matches _xHHHH_ where each H is a hexadecimal digit (either case). Restricting the
# class to hex digits keeps ordinary text such as "_xZZZZ_" from being treated as an escape.
HEX_ESCAPE_REGEXP = /_x[0-9A-Fa-f]{4}_/
10+
711
##
812
# The heart of typecasting. The ruby type is determined either explicitly
913
# from the cell xml or implicitly from the cell style, and this
@@ -45,9 +49,9 @@ def self.call(value, type, style, options = {})
4549
when 'b'
4650
value.to_i == 1
4751
when 'str'
48-
value
52+
unescape_string(value)
4953
when 'inlineStr'
50-
value
54+
unescape_string(value)
5155

5256
##
5357
# Type can also be determined by a style,
@@ -112,6 +116,12 @@ def self.convert_bignum(value)
112116
end
113117
end
114118

119+
##
# Replaces Excel's _xHHHH_ escape sequences in +value+ with the character they encode.
#
# Excel encodes some non-printable characters using a hex code in the format _xHHHH_,
# e.g. Carriage Return (\r) is encoded as _x000D_.
#
# Sequences that cannot be decoded are left in the output unchanged instead of raising
# or producing a wrong character:
# * payloads that are not four hexadecimal digits (e.g. _xZZZZ_)
# * code points that are not valid in UTF-8, such as lone surrogates (e.g. _xD800_)
#
# value:: the String to unescape
# Returns a new String; +value+ itself is not modified.
def self.unescape_string(value)
  value.gsub(HEX_ESCAPE_REGEXP) do |match|
    hex = match[2, 4] # the four characters between "_x" and the trailing "_"
    next match unless hex.match?(/\A\h{4}\z/)

    begin
      hex.hex.chr(Encoding::UTF_8)
    rescue RangeError
      # Integer#chr rejects code points that are invalid in UTF-8 (surrogates);
      # keep the original text rather than aborting the whole parse.
      match
    end
  end
end
124+
115125
private
116126

117127
def self.base_date(options)

spec/fixtures/sst.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,7 @@
7575
<t>B2</t>
7676
</r>
7777
</si>
78+
<si>
79+
<t>Cell with_x000D_escaped_x000D_characters</t>
80+
</si>
7881
</sst>

spec/shared_string_spec.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@
77
doc = Nokogiri::XML(shared_strings_xml_file)
88
dictionary = Creek::SharedStrings.parse_shared_string_from_document(doc)
99

10-
expect(dictionary.keys.size).to eq(5)
10+
expect(dictionary.keys.size).to eq(6)
1111
expect(dictionary[0]).to eq('Cell A1')
1212
expect(dictionary[1]).to eq('Cell B1')
1313
expect(dictionary[2]).to eq('My Cell')
1414
expect(dictionary[3]).to eq('Cell A2')
1515
expect(dictionary[4]).to eq('Cell B2')
16+
expect(dictionary[5]).to eq("Cell with\rescaped\rcharacters")
1617
end
1718

1819
end

0 commit comments

Comments
 (0)