@@ -97,6 +97,154 @@ pub fn to_string(bits: BitArray) -> Result(String, Nil) {
// External declaration: binds `unsafe_to_string` to the `identity` function of
// the `gleam_stdlib` Erlang module, typed here as taking a `BitArray` and
// returning a `String`.
// NOTE(review): the name suggests the bytes are reinterpreted without any
// UTF-8 validation, so callers would need to validate first; confirm against
// the Erlang implementation of `identity`.
9797@ external ( erlang , "gleam_stdlib" , "identity" )
9898fn unsafe_to_string ( a : BitArray ) -> String
9999
/// Decodes a bit array into a string without failing on malformed input.
///
/// Valid UTF-8 in `bits` is kept as-is. Any data that cannot be decoded is
/// handed to `map_invalid_bits`, and the string that callback returns is
/// inserted into the result where the invalid data was found.
///
/// ## Examples
///
/// ```gleam
/// to_string_lossy(<<"A":utf8, 0x80, "1":utf8, 0:size(5)>>, fn(_) { "�" })
/// // -> "A�1�"
/// ```
///
pub fn to_string_lossy(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
) -> String {
  // Start the target-specific worker with an empty accumulator.
  bits
  |> to_string_lossy_impl(map_invalid_bits, "")
}
117+
// Erlang implementation of the lossy decoder. Consumes `bits` from the front,
// appending decoded text (or the callback's replacement for undecodable data)
// to `acc`, and returns `acc` once the input is exhausted.
@target(erlang)
fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  case bits {
    // Nothing left to decode.
    <<>> -> acc

    // The input starts with a valid UTF-8 codepoint: keep it unchanged.
    <<codepoint:utf8_codepoint, remaining:bits>> -> {
      let decoded = string.from_utf_codepoints([codepoint])
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> decoded)
    }

    // The leading byte does not start a valid codepoint: replace just that
    // byte and carry on with whatever follows it.
    <<invalid_byte:bytes-size(1), remaining:bits>> -> {
      let replacement = map_invalid_bits(invalid_byte)
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> replacement)
    }

    // Non-empty but shorter than one byte: the trailing bits are replaced as
    // a single chunk.
    _ -> acc <> map_invalid_bits(bits)
  }
}
140+
// JavaScript implementation of the lossy decoder. It decodes UTF-8 by hand
// because it does not use the `utf8_codepoint` bit array segment type. Once
// the JavaScript target supports `utf8_codepoint` this function and its two
// helpers should be removed.
//
// - `bits`: the input that is still to be decoded.
// - `map_invalid_bits`: called with each undecodable chunk; its result is
//   appended in place of that chunk.
// - `acc`: the string built so far; returned once `bits` is empty.
//
// Only well-formed sequences are accepted: overlong encodings, surrogates and
// out-of-range values are rejected, and on rejection exactly one byte is
// reported as invalid before decoding resumes at the next byte.
//
// NOTE(review): the `bytes` segments below only match a byte-aligned
// remainder. If this target can represent unaligned bit arrays, an unaligned
// input reaches the final clause as one chunk. Confirm whether `bits` should
// be used here instead.
@target(javascript)
fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  case bits {
    <<>> -> acc

    _ ->
      case lossy_decode_codepoint(bits) {
        // A well-formed sequence leads the input: keep the decoded codepoint.
        Ok(#(codepoint, rest)) ->
          to_string_lossy_impl(
            rest,
            map_invalid_bits,
            acc <> string.from_utf_codepoints([codepoint]),
          )

        // No well-formed sequence starts here: replace only the first byte so
        // that the bytes after it get their own chance to decode.
        Error(Nil) ->
          case bits {
            <<x:bytes-size(1), rest:bytes>> ->
              to_string_lossy_impl(
                rest,
                map_invalid_bits,
                acc <> map_invalid_bits(x),
              )

            _ -> acc <> map_invalid_bits(bits)
          }
      }
  }
}

// Tries to decode one UTF-8 encoded codepoint from the front of `bits`.
// Returns the codepoint together with the bytes that follow it, or
// `Error(Nil)` when `bits` does not start with a well-formed sequence.
@target(javascript)
fn lossy_decode_codepoint(
  bits: BitArray,
) -> Result(#(UtfCodepoint, BitArray), Nil) {
  case bits {
    // 1-byte UTF-8 character
    <<b0, rest:bytes>> if b0 <= 0x7F -> lossy_checked_codepoint(b0, 0, rest)

    // 2-byte UTF-8 character, must encode at least U+0080
    <<b0, b1, rest:bytes>>
      if b0 >= 0xC0 && b0 <= 0xDF && b1 >= 0x80 && b1 <= 0xBF
    ->
      lossy_checked_codepoint(
        int.bitwise_and(b0, 0x1F) * 64 + int.bitwise_and(b1, 0x3F),
        0x80,
        rest,
      )

    // 3-byte UTF-8 character, must encode at least U+0800
    <<b0, b1, b2, rest:bytes>>
      if b0 >= 0xE0
      && b0 <= 0xEF
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
    ->
      lossy_checked_codepoint(
        int.bitwise_and(b0, 0x0F)
          * 4096
          + int.bitwise_and(b1, 0x3F)
          * 64
          + int.bitwise_and(b2, 0x3F),
        0x800,
        rest,
      )

    // 4-byte UTF-8 character, must encode at least U+10000
    <<b0, b1, b2, b3, rest:bytes>>
      if b0 >= 0xF0
      && b0 <= 0xF7
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
      && b3 >= 0x80
      && b3 <= 0xBF
    ->
      lossy_checked_codepoint(
        int.bitwise_and(b0, 0x07)
          * 262_144
          + int.bitwise_and(b1, 0x3F)
          * 4096
          + int.bitwise_and(b2, 0x3F)
          * 64
          + int.bitwise_and(b3, 0x3F),
        0x10000,
        rest,
      )

    _ -> Error(Nil)
  }
}

// Accepts `value` only if it is at least `minimum` (which rules out overlong
// encodings) and `string.utf_codepoint` accepts it. On success the codepoint
// is paired with `rest`.
@target(javascript)
fn lossy_checked_codepoint(
  value: Int,
  minimum: Int,
  rest: BitArray,
) -> Result(#(UtfCodepoint, BitArray), Nil) {
  case value >= minimum {
    False -> Error(Nil)
    True ->
      case string.utf_codepoint(value) {
        Ok(codepoint) -> Ok(#(codepoint, rest))
        Error(Nil) -> Error(Nil)
      }
  }
}
247+
100248/// Creates a new bit array by joining multiple binaries.
101249///
102250/// ## Examples
0 commit comments