Skip to content

Commit e41fac8

Browse files
authored
Merge pull request #422 from sosy-lab/420-more-string-theory
#420: More operations in string theory
2 parents 08820e3 + 1a0a75e commit e41fac8

20 files changed

+570
-362
lines changed

src/org/sosy_lab/java_smt/api/FunctionDeclarationKind.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,8 @@ public enum FunctionDeclarationKind {
300300
STR_IN_RE,
301301
STR_TO_INT,
302302
INT_TO_STR,
303+
STR_FROM_CODE,
304+
STR_TO_CODE,
303305
STR_LT,
304306
STR_LE,
305307
RE_PLUS,

src/org/sosy_lab/java_smt/api/StringFormulaManager.java

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,23 @@
1414
import org.sosy_lab.java_smt.api.NumeralFormula.IntegerFormula;
1515

1616
/**
17-
* Manager for dealing with string formulas. Functions come from
18-
* http://smtlib.cs.uiowa.edu/theories-UnicodeStrings.shtml.
17+
* Manager for dealing with string formulas. Functions come from <a
18+
* href="http://smtlib.cs.uiowa.edu/theories-UnicodeStrings.shtml">String theory in SMTLIB</a>.
1919
*/
2020
public interface StringFormulaManager {
2121

2222
/**
23-
* Returns a {@link StringFormula} representing the given constant.
23+
* Creates a {@link StringFormula} representing the given constant String.
2424
*
25-
* @param value the string value the returned <code>Formula</code> should represent
26-
* @return a Formula representing the given value
25+
* <p>This method accepts plain Java Strings with Unicode characters from the Basic Multilingual
26+
* Plane (BMP) (codepoints in range [0x00000, 0x2FFFF]). JavaSMT handles escaping internally, as
27+
* some solvers follow the SMTLIB standard and escape Unicode characters with curly braces.
28+
*
29+
* <p>Additionally, you can use SMTLIB escaping like "\\u{1234}" to represent Unicode characters
30+
* directly.
31+
*
32+
* @param value the string value the returned {@link StringFormula} should represent
33+
* @return a {@link StringFormula} representing the given value
2734
*/
2835
StringFormula makeString(String value);
2936

@@ -71,9 +78,8 @@ public interface StringFormulaManager {
7178
IntegerFormula indexOf(StringFormula str, StringFormula part, IntegerFormula startIndex);
7279

7380
/**
74-
* Get a substring of length 1 from the given String.
75-
*
76-
* <p>The result is underspecified, if the index is out of bounds for the given String.
81+
* Get a substring of length 1 from the given String if the given index is within bounds.
82+
* Otherwise, returns an empty string.
7783
*/
7884
StringFormula charAt(StringFormula str, IntegerFormula index);
7985

@@ -231,4 +237,17 @@ default RegexFormula concat(RegexFormula... parts) {
231237
* It returns the empty string <code>""</code> for negative numbers.
232238
*/
233239
StringFormula toStringFormula(IntegerFormula number);
240+
241+
/**
242+
* Returns an Integer formula representing the code point of the only character of the given
243+
* String formula, if it represents a single character. Otherwise, returns -1.
244+
*/
245+
IntegerFormula toCodePoint(StringFormula str);
246+
247+
/**
248+
* Returns a String formula representing the single character with the given code point, if it is
249+
* a valid Unicode code point within the Basic Multilingual Plane (BMP) (codepoints in range
250+
* [0x00000, 0x2FFFF]). Otherwise, returns the empty string.
251+
*/
252+
StringFormula fromCodePoint(IntegerFormula codePoint);
234253
}

src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@
88

99
package org.sosy_lab.java_smt.basicimpl;
1010

11+
import static com.google.common.base.Preconditions.checkArgument;
1112
import static org.sosy_lab.java_smt.basicimpl.AbstractFormulaManager.checkVariableName;
1213

1314
import com.google.common.collect.Iterables;
1415
import com.google.common.collect.Lists;
1516
import java.util.Collections;
1617
import java.util.List;
18+
import java.util.regex.Matcher;
19+
import java.util.regex.Pattern;
1720
import org.sosy_lab.java_smt.api.BooleanFormula;
1821
import org.sosy_lab.java_smt.api.FormulaType;
1922
import org.sosy_lab.java_smt.api.NumeralFormula;
@@ -27,6 +30,16 @@ public abstract class AbstractStringFormulaManager<TFormulaInfo, TType, TEnv, TF
2730
extends AbstractBaseFormulaManager<TFormulaInfo, TType, TEnv, TFuncDecl>
2831
implements StringFormulaManager {
2932

33+
private static final Pattern UNICODE_ESCAPE_PATTERN =
34+
Pattern.compile(
35+
// start with "\\u"
36+
"\\\\u"
37+
// either a plain Unicode letter like "\\u0061"
38+
+ "((?<codePoint>[0-9a-fA-F]{4})"
39+
+ "|"
40+
// or curly brackets like "\\u{61}"
41+
+ "(\\{(?<codePointInBrackets>[0-9a-fA-F]{1,5})}))");
42+
3043
protected AbstractStringFormulaManager(
3144
FormulaCreator<TFormulaInfo, TType, TEnv, TFuncDecl> pCreator) {
3245
super(pCreator);
@@ -40,11 +53,59 @@ protected RegexFormula wrapRegex(TFormulaInfo formulaInfo) {
4053
return getFormulaCreator().encapsulateRegex(formulaInfo);
4154
}
4255

56+
protected IntegerFormula wrapInteger(TFormulaInfo formulaInfo) {
57+
return getFormulaCreator().encapsulate(FormulaType.IntegerType, formulaInfo);
58+
}
59+
4360
@Override
4461
public StringFormula makeString(String value) {
62+
checkArgument(
63+
areAllCodePointsInRange(value),
64+
"String constant is out of supported Unicode range (Plane 0-2).");
4565
return wrapString(makeStringImpl(value));
4666
}
4767

68+
/** returns whether all Unicode characters in Planes 0-2. */
69+
private static boolean areAllCodePointsInRange(String str) {
70+
return str.codePoints().allMatch(AbstractStringFormulaManager::isCodePointInRange);
71+
}
72+
73+
private static boolean isCodePointInRange(int codePoint) {
74+
return 0x00000 <= codePoint && codePoint <= 0x2FFFF;
75+
}
76+
77+
/** Replace Unicode letters in UTF16 representation with their escape sequences. */
78+
protected static String escapeUnicodeForSmtlib(String input) {
79+
StringBuilder sb = new StringBuilder();
80+
for (int codePoint : input.codePoints().toArray()) {
81+
if (0x20 <= codePoint && codePoint <= 0x7E) {
82+
sb.appendCodePoint(codePoint); // normal printable chars
83+
} else {
84+
sb.append("\\u{").append(String.format("%05X", codePoint)).append("}");
85+
}
86+
}
87+
return sb.toString();
88+
}
89+
90+
/** Replace escaped Unicode letters in SMTLIB representation with their UTF16 pendant. */
91+
public static String unescapeUnicodeForSmtlib(String input) {
92+
Matcher matcher = UNICODE_ESCAPE_PATTERN.matcher(input);
93+
StringBuilder sb = new StringBuilder();
94+
while (matcher.find()) {
95+
String hexCodePoint = matcher.group("codePoint");
96+
if (hexCodePoint == null) {
97+
hexCodePoint = matcher.group("codePointInBrackets");
98+
}
99+
int codePoint = Integer.parseInt(hexCodePoint, 16);
100+
checkArgument(
101+
isCodePointInRange(codePoint),
102+
"SMTLIB does only specify Unicode letters from Planes 0-2");
103+
matcher.appendReplacement(sb, Character.toString(codePoint));
104+
}
105+
matcher.appendTail(sb);
106+
return sb.toString();
107+
}
108+
48109
protected abstract TFormulaInfo makeStringImpl(String value);
49110

50111
@Override
@@ -141,10 +202,7 @@ public BooleanFormula contains(StringFormula str, StringFormula part) {
141202

142203
@Override
143204
public IntegerFormula indexOf(StringFormula str, StringFormula part, IntegerFormula startIndex) {
144-
return getFormulaCreator()
145-
.encapsulate(
146-
FormulaType.IntegerType,
147-
indexOf(extractInfo(str), extractInfo(part), extractInfo(startIndex)));
205+
return wrapInteger(indexOf(extractInfo(str), extractInfo(part), extractInfo(startIndex)));
148206
}
149207

150208
protected abstract TFormulaInfo indexOf(
@@ -287,8 +345,7 @@ public RegexFormula times(RegexFormula regex, int repetitions) {
287345

288346
@Override
289347
public IntegerFormula toIntegerFormula(StringFormula str) {
290-
return getFormulaCreator()
291-
.encapsulate(FormulaType.IntegerType, toIntegerFormula(extractInfo(str)));
348+
return wrapInteger(toIntegerFormula(extractInfo(str)));
292349
}
293350

294351
protected abstract TFormulaInfo toIntegerFormula(TFormulaInfo pParam);
@@ -299,4 +356,18 @@ public StringFormula toStringFormula(IntegerFormula number) {
299356
}
300357

301358
protected abstract TFormulaInfo toStringFormula(TFormulaInfo pParam);
359+
360+
@Override
361+
public IntegerFormula toCodePoint(StringFormula number) {
362+
return wrapInteger(toCodePoint(extractInfo(number)));
363+
}
364+
365+
protected abstract TFormulaInfo toCodePoint(TFormulaInfo pParam);
366+
367+
@Override
368+
public StringFormula fromCodePoint(IntegerFormula number) {
369+
return wrapString(fromCodePoint(extractInfo(number)));
370+
}
371+
372+
protected abstract TFormulaInfo fromCodePoint(TFormulaInfo pParam);
302373
}

src/org/sosy_lab/java_smt/delegate/debugging/DebuggingStringFormulaManager.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,4 +354,22 @@ public StringFormula toStringFormula(IntegerFormula number) {
354354
debugging.addFormulaTerm(result);
355355
return result;
356356
}
357+
358+
@Override
359+
public IntegerFormula toCodePoint(StringFormula str) {
360+
debugging.assertThreadLocal();
361+
debugging.assertFormulaInContext(str);
362+
IntegerFormula result = delegate.toCodePoint(str);
363+
debugging.addFormulaTerm(result);
364+
return result;
365+
}
366+
367+
@Override
368+
public StringFormula fromCodePoint(IntegerFormula codepoint) {
369+
debugging.assertThreadLocal();
370+
debugging.assertFormulaInContext(codepoint);
371+
StringFormula result = delegate.fromCodePoint(codepoint);
372+
debugging.addFormulaTerm(result);
373+
return result;
374+
}
357375
}

src/org/sosy_lab/java_smt/delegate/statistics/StatisticsStringFormulaManager.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,4 +233,16 @@ public StringFormula toStringFormula(IntegerFormula number) {
233233
stats.stringOperations.getAndIncrement();
234234
return delegate.toStringFormula(number);
235235
}
236+
237+
@Override
238+
public IntegerFormula toCodePoint(StringFormula str) {
239+
stats.stringOperations.getAndIncrement();
240+
return delegate.toCodePoint(str);
241+
}
242+
243+
@Override
244+
public StringFormula fromCodePoint(IntegerFormula codepoint) {
245+
stats.stringOperations.getAndIncrement();
246+
return delegate.fromCodePoint(codepoint);
247+
}
236248
}

src/org/sosy_lab/java_smt/delegate/synchronize/SynchronizedStringFormulaManager.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,4 +268,18 @@ public StringFormula toStringFormula(IntegerFormula number) {
268268
return delegate.toStringFormula(number);
269269
}
270270
}
271+
272+
@Override
273+
public IntegerFormula toCodePoint(StringFormula str) {
274+
synchronized (sync) {
275+
return delegate.toCodePoint(str);
276+
}
277+
}
278+
279+
@Override
280+
public StringFormula fromCodePoint(IntegerFormula codepoint) {
281+
synchronized (sync) {
282+
return delegate.fromCodePoint(codepoint);
283+
}
284+
}
271285
}

src/org/sosy_lab/java_smt/solvers/cvc4/CVC4FormulaCreator.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import static com.google.common.base.Preconditions.checkArgument;
1212
import static com.google.common.base.Preconditions.checkState;
13+
import static org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager.unescapeUnicodeForSmtlib;
1314

1415
import com.google.common.base.Preconditions;
1516
import com.google.common.collect.ImmutableList;
@@ -511,6 +512,8 @@ private Expr normalize(Expr operator) {
511512
.put(Kind.STRING_IN_REGEXP, FunctionDeclarationKind.STR_IN_RE)
512513
.put(Kind.STRING_STOI, FunctionDeclarationKind.STR_TO_INT)
513514
.put(Kind.STRING_ITOS, FunctionDeclarationKind.INT_TO_STR)
515+
.put(Kind.STRING_TO_CODE, FunctionDeclarationKind.STR_TO_CODE)
516+
.put(Kind.STRING_FROM_CODE, FunctionDeclarationKind.STR_FROM_CODE)
514517
.put(Kind.STRING_LT, FunctionDeclarationKind.STR_LT)
515518
.put(Kind.STRING_LEQ, FunctionDeclarationKind.STR_LE)
516519
.put(Kind.REGEXP_PLUS, FunctionDeclarationKind.RE_PLUS)
@@ -609,7 +612,7 @@ public Object convertValue(Expr expForType, Expr value) {
609612
return convertFloatingPoint(value);
610613

611614
} else if (valueType.isString()) {
612-
return value.getConstString().toString();
615+
return unescapeUnicodeForSmtlib(value.getConstString().toString());
613616

614617
} else {
615618
// String serialization for unknown terms.

src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class CVC4StringFormulaManager extends AbstractStringFormulaManager<Expr, Type,
3030
@Override
3131
protected Expr makeStringImpl(String pValue) {
3232
// The boolean enables escape characters!
33-
return exprManager.mkConst(new CVC4String(pValue, true));
33+
return exprManager.mkConst(new CVC4String(escapeUnicodeForSmtlib(pValue), true));
3434
}
3535

3636
@Override
@@ -190,4 +190,14 @@ protected Expr toIntegerFormula(Expr pParam) {
190190
protected Expr toStringFormula(Expr pParam) {
191191
return exprManager.mkExpr(Kind.STRING_ITOS, pParam);
192192
}
193+
194+
@Override
195+
protected Expr toCodePoint(Expr pParam) {
196+
return exprManager.mkExpr(Kind.STRING_TO_CODE, pParam);
197+
}
198+
199+
@Override
200+
protected Expr fromCodePoint(Expr pParam) {
201+
return exprManager.mkExpr(Kind.STRING_FROM_CODE, pParam);
202+
}
193203
}

src/org/sosy_lab/java_smt/solvers/cvc4/CVC4TheoremProver.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import com.google.common.base.Preconditions;
1212
import com.google.common.collect.Collections2;
1313
import com.google.common.collect.ImmutableList;
14+
import edu.stanford.CVC4.Exception;
1415
import edu.stanford.CVC4.Expr;
1516
import edu.stanford.CVC4.ExprManager;
1617
import edu.stanford.CVC4.ExprManagerMapCollection;
@@ -123,16 +124,25 @@ protected void popImpl() {
123124
}
124125

125126
@Override
126-
protected @Nullable Void addConstraintImpl(BooleanFormula pF) throws InterruptedException {
127+
protected @Nullable Void addConstraintImpl(BooleanFormula pF)
128+
throws InterruptedException, SolverException {
127129
Preconditions.checkState(!closed);
128130
setChanged();
129-
Expr exp = creator.extractInfo(pF);
130131
if (incremental) {
131-
smtEngine.assertFormula(importExpr(exp));
132+
assertFormula(pF);
132133
}
133134
return null;
134135
}
135136

137+
private void assertFormula(BooleanFormula pF) throws SolverException {
138+
try {
139+
smtEngine.assertFormula(importExpr(creator.extractInfo(pF)));
140+
} catch (Exception cvc4Exception) {
141+
throw new SolverException(
142+
String.format("CVC4 crashed while adding the constraint '%s'", pF), cvc4Exception);
143+
}
144+
}
145+
136146
@SuppressWarnings("resource")
137147
@Override
138148
public CVC4Model getModel() throws InterruptedException, SolverException {
@@ -188,7 +198,9 @@ public boolean isUnsat() throws InterruptedException, SolverException {
188198
closeAllEvaluators();
189199
changedSinceLastSatQuery = false;
190200
if (!incremental) {
191-
getAssertedFormulas().forEach(f -> smtEngine.assertFormula(creator.extractInfo(f)));
201+
for (BooleanFormula f : getAssertedFormulas()) {
202+
assertFormula(f);
203+
}
192204
}
193205

194206
Result result;

src/org/sosy_lab/java_smt/solvers/cvc5/CVC5FormulaCreator.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,8 @@ private Term normalize(Term operator) {
626626
.put(Kind.STRING_IN_REGEXP, FunctionDeclarationKind.STR_IN_RE)
627627
.put(Kind.STRING_FROM_INT, FunctionDeclarationKind.INT_TO_STR)
628628
.put(Kind.STRING_TO_INT, FunctionDeclarationKind.STR_TO_INT)
629+
.put(Kind.STRING_TO_CODE, FunctionDeclarationKind.STR_TO_CODE)
630+
.put(Kind.STRING_FROM_CODE, FunctionDeclarationKind.STR_FROM_CODE)
629631
.put(Kind.STRING_LT, FunctionDeclarationKind.STR_LT)
630632
.put(Kind.STRING_LEQ, FunctionDeclarationKind.STR_LE)
631633
.put(Kind.REGEXP_PLUS, FunctionDeclarationKind.RE_PLUS)

0 commit comments

Comments
 (0)