Skip to content

Commit bbbe4d9

Browse files
committed
improve and fix Unicode support in String theory.
Unicode characters were not handled in the same way in SMT solvers. Z3, CVC4, and CVC5 require proper escaping. Z3 also supports Java-based Unicode characters. Princess only allows 16bit-sized Unicode, i.e., up to 0x0FFFF, which misses surrogate pairs up to 0x2FFFF as defined in SMTLIB.
1 parent 834b695 commit bbbe4d9

File tree

8 files changed

+239
-293
lines changed

8 files changed

+239
-293
lines changed

src/org/sosy_lab/java_smt/api/StringFormulaManager.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@
2020
public interface StringFormulaManager {
2121

2222
/**
23-
* Returns a {@link StringFormula} representing the given constant.
23+
* Returns a {@link StringFormula} representing the given constant String. The String value is
24+
* expected to be UTF16, i.e., plain Java string with Unicode characters. JavaSMT handles escaping
25+
* internally, as some solvers follow the SMTLIB standard and escape Unicode characters with curly
26+
* braces. We hide this detail from the user and allow to use plain Java Strings.
2427
*
2528
* @param value the string value the returned <code>Formula</code> should represent
2629
* @return a Formula representing the given value

src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@
88

99
package org.sosy_lab.java_smt.basicimpl;
1010

11+
import static com.google.common.base.Preconditions.checkArgument;
1112
import static org.sosy_lab.java_smt.basicimpl.AbstractFormulaManager.checkVariableName;
1213

1314
import com.google.common.collect.Iterables;
1415
import com.google.common.collect.Lists;
1516
import java.util.Collections;
1617
import java.util.List;
18+
import java.util.regex.Matcher;
19+
import java.util.regex.Pattern;
1720
import org.sosy_lab.java_smt.api.BooleanFormula;
1821
import org.sosy_lab.java_smt.api.FormulaType;
1922
import org.sosy_lab.java_smt.api.NumeralFormula;
@@ -27,6 +30,16 @@ public abstract class AbstractStringFormulaManager<TFormulaInfo, TType, TEnv, TF
2730
extends AbstractBaseFormulaManager<TFormulaInfo, TType, TEnv, TFuncDecl>
2831
implements StringFormulaManager {
2932

33+
private static final Pattern UNICODE_ESCAPE_PATTERN =
34+
Pattern.compile(
35+
// start with "\\u"
36+
"\\\\u"
37+
// either a plain Unicode letter like "\\u0061"
38+
+ "((?<codePoint>[0-9a-fA-F]{4})"
39+
+ "|"
40+
// or curly brackets like "\\u{61}"
41+
+ "(\\{(?<codePointInBrackets>[0-9a-fA-F]{1,5})}))");
42+
3043
protected AbstractStringFormulaManager(
3144
FormulaCreator<TFormulaInfo, TType, TEnv, TFuncDecl> pCreator) {
3245
super(pCreator);
@@ -46,9 +59,49 @@ protected IntegerFormula wrapInteger(TFormulaInfo formulaInfo) {
4659

4760
@Override
4861
public StringFormula makeString(String value) {
62+
checkArgument(
63+
isCodePointInRange(value),
64+
"String constant is out of supported Unicode range (Plane 0-2).");
4965
return wrapString(makeStringImpl(value));
5066
}
5167

68+
/** returns whether all Unicode characters in Planes 0-2. */
69+
static boolean isCodePointInRange(String str) {
70+
return str.codePoints().allMatch(codePoint -> 0x00000 <= codePoint && codePoint <= 0x2FFFF);
71+
}
72+
73+
/** Replace Unicode letters in UTF16 representation with their escape sequences. */
74+
protected static String escapeUnicodeForSmtlib(String input) {
75+
StringBuilder sb = new StringBuilder();
76+
for (int codePoint : input.codePoints().toArray()) {
77+
if (0x20 <= codePoint && codePoint <= 0x7E) {
78+
sb.appendCodePoint(codePoint); // normal printable chars
79+
} else {
80+
sb.append("\\u{").append(String.format("%05X", codePoint)).append("}");
81+
}
82+
}
83+
return sb.toString();
84+
}
85+
86+
/** Replace escaped Unicode letters in SMTLIB representation with their UTF16 pendant. */
87+
public static String unescapeUnicodeForSmtlib(String input) {
88+
Matcher matcher = UNICODE_ESCAPE_PATTERN.matcher(input);
89+
StringBuilder sb = new StringBuilder();
90+
while (matcher.find()) {
91+
String hexCodePoint = matcher.group("codePoint");
92+
if (hexCodePoint == null) {
93+
hexCodePoint = matcher.group("codePointInBrackets");
94+
}
95+
int codePoint = Integer.parseInt(hexCodePoint, 16);
96+
checkArgument(
97+
0x00000 <= codePoint && codePoint <= 0x2FFFF,
98+
"SMTLIB does only specify Unicode letters from Planes 0-2");
99+
matcher.appendReplacement(sb, Character.toString(codePoint));
100+
}
101+
matcher.appendTail(sb);
102+
return sb.toString();
103+
}
104+
52105
protected abstract TFormulaInfo makeStringImpl(String value);
53106

54107
@Override

src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class CVC4StringFormulaManager extends AbstractStringFormulaManager<Expr, Type,
3030
@Override
3131
protected Expr makeStringImpl(String pValue) {
3232
// The boolean enables escape characters!
33-
return exprManager.mkConst(new CVC4String(pValue, true));
33+
return exprManager.mkConst(new CVC4String(escapeUnicodeForSmtlib(pValue), true));
3434
}
3535

3636
@Override

src/org/sosy_lab/java_smt/solvers/cvc5/CVC5StringFormulaManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class CVC5StringFormulaManager extends AbstractStringFormulaManager<Term, Sort,
2727

2828
@Override
2929
protected Term makeStringImpl(String pValue) {
30-
return solver.mkString(pValue, true);
30+
return solver.mkString(escapeUnicodeForSmtlib(pValue), true);
3131
}
3232

3333
@Override

src/org/sosy_lab/java_smt/solvers/princess/PrincessStringFormulaManager.java

Lines changed: 9 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
package org.sosy_lab.java_smt.solvers.princess;
1010

11+
import static com.google.common.base.Preconditions.checkArgument;
1112
import static org.sosy_lab.java_smt.solvers.princess.PrincessEnvironment.toITermSeq;
1213

1314
import ap.parser.IAtom;
@@ -29,104 +30,16 @@ public class PrincessStringFormulaManager
2930
super(pCreator);
3031
}
3132

32-
/**
33-
* Tries to parse an escaped Unicode character
34-
*
35-
* <p>Returns the original String if parsing is not possible.
36-
*/
37-
private static String literalOrSkip(String pToken) {
38-
String literal;
39-
if (pToken.startsWith("\\u{")) {
40-
if (pToken.length() > 9 || !pToken.endsWith("}")) {
41-
// Abort if there is no closing bracket, or if there are too many digits for the literal
42-
// The longest allowed literal is \\u{d5 d4 d3 d2 d1}
43-
return pToken;
44-
}
45-
literal = pToken.substring(3, pToken.length() - 1);
46-
} else {
47-
if (pToken.length() != 6) {
48-
// Abort if there are not exactly 4 digits
49-
// The literal must have this form: \\u d3 d2 d1 d0
50-
return pToken;
51-
}
52-
literal = pToken.substring(2);
53-
}
54-
55-
// Try to parse the digits as a (hexadecimal) integer
56-
// Abort if there is an error
57-
int value;
58-
try {
59-
value = Integer.parseInt(literal, 16);
60-
} catch (NumberFormatException e) {
61-
return pToken;
62-
}
63-
64-
// Return the Unicode letter if it fits into a single 16bit character
65-
// Otherwise throw an exception as we don't support Unicode characters with more than 4 digits
66-
char[] chars = Character.toChars(value);
67-
if (chars.length != 1) {
68-
throw new IllegalArgumentException();
69-
} else {
70-
return String.valueOf(chars[0]);
71-
}
72-
}
73-
74-
/** Replace escape sequences for Unicode letters with their UTF16 representation. */
75-
static String unescapeString(String pInput) {
76-
StringBuilder builder = new StringBuilder();
77-
while (!pInput.isEmpty()) {
78-
// Search for the next escape sequence
79-
int start = pInput.indexOf("\\u");
80-
if (start == -1) {
81-
// Append the rest of the String to the output if there are no more escaped Unicode
82-
// characters
83-
builder.append(pInput);
84-
pInput = "";
85-
} else {
86-
// Store the prefix up to the escape sequence
87-
String prefix = pInput.substring(0, start);
88-
89-
// Skip ahead and get the escape sequence
90-
pInput = pInput.substring(start);
91-
String value;
92-
if (pInput.charAt(2) == '{') {
93-
// Sequence has the form \\u{d5 d4 d3 d2 d1 d0}
94-
// Find the closing bracket for the literal:
95-
int stop = pInput.indexOf('}');
96-
if (stop == -1) {
97-
// Use the index right after "\\u{" if there is no closing bracket
98-
stop = 2;
99-
}
100-
value = pInput.substring(0, stop + 1);
101-
pInput = pInput.substring(stop + 1);
102-
} else {
103-
// Sequence has the form \\u d3 d2 d1 d0
104-
int stop = 2;
105-
while (stop < pInput.length()) {
106-
char c = pInput.charAt(stop);
107-
if (Character.digit(c, 16) == -1) {
108-
break;
109-
}
110-
stop++;
111-
}
112-
value = pInput.substring(0, stop);
113-
pInput = pInput.substring(stop);
114-
}
115-
116-
// Try to parse the escape sequence to replace it with its 16bit Unicode character
117-
// If parsing fails just keep it in the String
118-
String nextToken = literalOrSkip(value);
119-
120-
// Collect the prefix and the (possibly) translated escape sequence
121-
builder.append(prefix).append(nextToken);
122-
}
123-
}
124-
return builder.toString();
125-
}
126-
12733
@Override
12834
protected IExpression makeStringImpl(String value) {
129-
return PrincessEnvironment.stringTheory.string2Term(unescapeString(value));
35+
value = unescapeUnicodeForSmtlib(value);
36+
checkArgument(!containsSurrogatePair(value), "Princess does not support surrogate pairs.");
37+
return PrincessEnvironment.stringTheory.string2Term(value);
38+
}
39+
40+
/** returns whether any character of the string is part of a surrogate pair. */
41+
private static boolean containsSurrogatePair(String str) {
42+
return str.codePoints().anyMatch(Character::isSupplementaryCodePoint);
13043
}
13144

13245
@Override

src/org/sosy_lab/java_smt/solvers/princess/UnicodeEscapeTest.java

Lines changed: 0 additions & 60 deletions
This file was deleted.

src/org/sosy_lab/java_smt/solvers/z3/Z3StringFormulaManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class Z3StringFormulaManager extends AbstractStringFormulaManager<Long, Long, Lo
2525

2626
@Override
2727
protected Long makeStringImpl(String pValue) {
28-
return Native.mkString(z3context, pValue);
28+
return Native.mkString(z3context, escapeUnicodeForSmtlib(pValue));
2929
}
3030

3131
@Override

0 commit comments

Comments
 (0)