|
3 | 3 | import java.io.BufferedReader; |
4 | 4 | import java.io.IOException; |
5 | 5 | import java.io.StringReader; |
| 6 | +import java.util.ArrayList; |
6 | 7 | import java.util.List; |
7 | 8 |
|
8 | 9 | import edu.stanford.nlp.io.RuntimeIOException; |
@@ -51,166 +52,95 @@ public class UniversalPOSMapper { |
51 | 52 | private UniversalPOSMapper() {} // static methods |
52 | 53 |
|
53 | 54 | public static void load() { |
54 | | - String newLine = System.lineSeparator(); |
55 | | - String rawPattern = String.join(newLine, |
56 | | - // ------------------------------ |
57 | | - // Context-sensitive mappings |
| 55 | + operations = new ArrayList<>(); |
| 56 | + // ------------------------------ |
| 57 | + // Context-sensitive mappings |
| 58 | + // ------------------------------ |
58 | 59 |
|
59 | | - // TO -> PART (in CONJP phrases) |
60 | | - "@CONJP < TO=target < VB", |
61 | | - "", |
62 | | - "relabel target PART", |
63 | | - "", |
| 60 | + // Each row is { tregex pattern, label for "relabel target" }; first row: TO -> PART (in CONJP phrases) |
| 61 | + String [][] contextMappings = new String [][] { |
| 62 | + { "@CONJP < TO=target < VB", "PART", }, |
| 63 | + { "@VP < @VP < (/^TO$/=target <... {/.*/})", "PART", }, |
| 64 | + { "@VP <: (/^TO$/=target <... {/.*/})", "PART", }, |
| 65 | + { "TO=target <... {/.*/}", "ADP", }, // otherwise TO -> ADP |
| 66 | + // Don't do this, we are now treating these as copular constructions |
| 67 | + // VB.* -> AUX (for passives where main verb is part of an ADJP) |
| 68 | + // @VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase|get|got|getting|gets|gotten)$/ ) < (@ADJP [ < VBN|VBD | < (@VP|ADJP < VBN|VBD) < CC ] ) |
| 69 | + // relabel target AUX |
64 | 70 |
|
65 | | - // TO -> PART |
66 | | - "@VP < @VP < (/^TO$/=target <... {/.*/})", |
67 | | - "", |
68 | | - "relabel target PART", |
69 | | - "", |
| 71 | + // VB.* -> AUX (for cases with fronted main VPs) |
| 72 | + { "@SINV < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ ) $-- (@VP < VBD|VBN))", |
| 73 | + "AUX", }, |
| 74 | + // VB.* -> AUX (another, rarer case of fronted VPs) |
| 75 | + { "@SINV < (@VP < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ )) $-- (@VP < VBD|VBN))", |
| 76 | + "AUX", }, |
70 | 77 |
|
71 | | - // TO -> PART |
72 | | - "@VP <: (/^TO$/=target <... {/.*/})", |
73 | | - "", |
74 | | - "relabel target PART", |
75 | | - "", |
| 78 | + // VB.* -> AUX (passive, case 2) |
| 79 | + //"%SQ|SINV < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ $++ (VP < VBD|VBN))", |
| 80 | + //"%relabel target AUX", |
| 81 | + // VB.* -> AUX (active, case 1) |
| 82 | + { "VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)", |
| 83 | + "AUX", }, |
76 | 84 |
|
77 | | - // TO -> ADP (otherwise) |
78 | | - "TO=target <... {/.*/}", |
79 | | - "", |
80 | | - "relabel target ADP", |
81 | | - "", |
| 85 | + // VB -> AUX (active, case 2) |
| 86 | + { "@SQ|SINV < (/^VB/=target $++ /^(?:VP)/ <... {/.*/})", "AUX" }, |
82 | 87 |
|
83 | | - // Don't do this, we are now treating these as copular constructions |
84 | | - // VB.* -> AUX (for passives where main verb is part of an ADJP) |
85 | | - // @VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase|get|got|getting|gets|gotten)$/ ) < (@ADJP [ < VBN|VBD | < (@VP|ADJP < VBN|VBD) < CC ] ) |
86 | | - //relabel target AUX", |
| 88 | + // otherwise, VB.* -> VERB |
| 89 | + { "/^VB.*/=target <... {/.*/}", "VERB", }, |
87 | 90 |
|
88 | | - // VB.* -> AUX (for cases with fronted main VPs) |
89 | | - "@SINV < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ ) $-- (@VP < VBD|VBN))", |
90 | | - "", |
91 | | - "relabel target AUX", |
92 | | - "", |
| 91 | + // IN -> SCONJ (subordinating conjunctions) |
| 92 | + { "/^SBAR(-[^ ]+)?$/ < (IN=target $++ @S|FRAG|SBAR|SINV <... {/.*/})", "SCONJ", }, |
93 | 93 |
|
94 | | - // VB.* -> AUX (another, rarer case of fronted VPs) |
95 | | - "@SINV < (@VP < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ )) $-- (@VP < VBD|VBN))", |
96 | | - "", |
97 | | - "relabel target AUX", |
98 | | - "", |
99 | | - |
100 | | - // VB.* -> AUX (passive, case 2) |
101 | | - "%SQ|SINV < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ $++ (VP < VBD|VBN))", |
102 | | - "", |
103 | | - "%relabel target AUX", |
104 | | - "", |
105 | | - |
106 | | - // VB.* -> AUX (active, case 1) |
107 | | - "VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)", |
108 | | - "", |
109 | | - "relabel target AUX", |
110 | | - "", |
111 | | - |
112 | | - // VB -> AUX (active, case 2) |
113 | | - "@SQ|SINV < (/^VB/=target $++ /^(?:VP)/ <... {/.*/})", |
114 | | - "", |
115 | | - "relabel target AUX", |
116 | | - "", |
117 | | - |
118 | | - // VB.* -> VERB |
119 | | - "/^VB.*/=target <... {/.*/}", |
120 | | - "", |
121 | | - "relabel target VERB", |
122 | | - "", |
123 | | - |
124 | | - // IN -> SCONJ (subordinating conjunctions) |
125 | | - "/^SBAR(-[^ ]+)?$/ < (IN=target $++ @S|FRAG|SBAR|SINV <... {/.*/})", |
126 | | - "", |
127 | | - "relabel target SCONJ", |
128 | | - "", |
129 | | - |
130 | | - // IN -> SCONJ (subordinating conjunctions II) |
131 | | - "@PP < (IN=target $+ @SBAR|S)", |
132 | | - "", |
133 | | - "relabel target SCONJ", |
134 | | - "", |
| 94 | + // IN -> SCONJ (subordinating conjunctions II) |
| 95 | + { "@PP < (IN=target $+ @SBAR|S)", "SCONJ" }, |
135 | 96 |
|
136 | | - // IN -> ADP (otherwise) |
137 | | - "IN=target < __", |
138 | | - "", |
139 | | - "relabel target ADP", |
140 | | - "", |
| 97 | + // IN -> ADP (otherwise) |
| 98 | + { "IN=target < __", "ADP" }, |
141 | 99 |
|
142 | | - // NN -> SYM (in case of the percent sign) |
143 | | - "NN=target <... {/\\\\%/}", |
144 | | - "", |
145 | | - "relabel target SYM", |
146 | | - "", |
| 100 | + // NN -> SYM (in case of the percent sign) |
| 101 | + { "NN=target <... {/[%]/}", "SYM" }, |
147 | 102 |
|
148 | | - // fused det-noun pronouns -> PRON |
149 | | - "NN=target < (/^(?i:(somebody|something|someone|anybody|anything|anyone|everybody|everything|everyone|nobody|nothing))$/)", |
150 | | - "", |
151 | | - "relabel target PRON", |
152 | | - "", |
| 103 | + // fused det-noun pronouns -> PRON |
| 104 | + { "NN=target < (/^(?i:(somebody|something|someone|anybody|anything|anyone|everybody|everything|everyone|nobody|nothing))$/)", |
| 105 | + "PRON" }, |
153 | 106 |
|
154 | | - // NN -> NOUN (otherwise) |
155 | | - "NN=target <... {/.*/}", |
156 | | - "", |
157 | | - "relabel target NOUN", |
158 | | - "", |
| 107 | + // NN -> NOUN (otherwise) |
| 108 | + { "NN=target <... {/.*/}", "NOUN" }, |
159 | 109 |
|
160 | | - // NFP -> PUNCT (in case of possibly repeated hyphens, asterisks or tildes) |
161 | | - "NFP=target <... {/^(~+|\\*+|\\-+)$/}", |
162 | | - "", |
163 | | - "relabel target PUNCT", |
164 | | - "", |
| 110 | + // NFP -> PUNCT (in case of possibly repeated hyphens, asterisks or tildes) |
| 111 | + { "NFP=target <... {/^(~+|\\*+|\\-+)$/}", "PUNCT", }, |
165 | 112 |
|
166 | | - // NFP -> SYM (otherwise) |
167 | | - "NFP=target <... {/.*/}", |
168 | | - "", |
169 | | - "relabel target SYM", |
170 | | - "", |
| 113 | + // NFP -> SYM (otherwise) |
| 114 | + { "NFP=target <... {/.*/}", "SYM" }, |
171 | 115 |
|
172 | | - // RB -> PART when it is verbal negation (not or its reductions) |
173 | | - "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", |
174 | | - "", |
175 | | - "relabel target PART", |
176 | | - "", |
| 116 | + // RB -> PART when it is verbal negation (not or its reductions) |
| 117 | + { "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", "PART" }, |
177 | 118 |
|
178 | | - // Otherwise RB -> ADV |
179 | | - "RB=target <... {/.*/}", |
180 | | - "", |
181 | | - "relabel target ADV", |
182 | | - "", |
| 119 | + // Otherwise RB -> ADV |
| 120 | + { "RB=target <... {/.*/}", "ADV" }, |
183 | 121 |
|
184 | | - // DT -> PRON (pronominal this/that/these/those) |
185 | | - "@NP <: (DT=target < /^(?i:th(is|at|ose|ese))$/)", |
186 | | - "", |
187 | | - "relabel target PRON", |
188 | | - "", |
| 122 | + // DT -> PRON (pronominal this/that/these/those) |
| 123 | + { "@NP <: (DT=target < /^(?i:th(is|at|ose|ese))$/)", "PRON", }, |
189 | 124 |
|
190 | | - // DT -> DET |
191 | | - "DT=target < __", |
192 | | - "", |
193 | | - "relabel target DET", |
194 | | - "", |
| 125 | + // DT -> DET |
| 126 | + { "DT=target < __", "DET" }, |
195 | 127 |
|
196 | | - // WDT -> PRON (pronominal that/which) |
197 | | - "@WHNP|NP <: (WDT=target < /^(?i:(that|which))$/)", |
198 | | - "", |
199 | | - "relabel target PRON", |
200 | | - "", |
| 128 | + // WDT -> PRON (pronominal that/which) |
| 129 | + { "@WHNP|NP <: (WDT=target < /^(?i:(that|which))$/)", "PRON" }, |
201 | 130 |
|
202 | | - // WDT->SCONJ (incorrectly tagged subordinating conjunctions) |
203 | | - "@SBAR < (WDT=target < /^(?i:(that|which))$/)", |
204 | | - "", |
205 | | - "relabel target SCONJ", |
206 | | - "", |
| 131 | + // WDT->SCONJ (incorrectly tagged subordinating conjunctions) |
| 132 | + { "@SBAR < (WDT=target < /^(?i:(that|which))$/)", "SCONJ" }, |
207 | 133 |
|
208 | | - // WDT -> DET |
209 | | - "WDT=target <... {/.*/}", |
210 | | - "", |
211 | | - "relabel target DET", |
212 | | - "", |
| 134 | + // WDT -> DET |
| 135 | + { "WDT=target <... {/.*/}", "DET" }, |
| 136 | + }; |
| 137 | + for (String[] newOp : contextMappings) { |
| 138 | + operations.add(new Pair<>(TregexPattern.compile(newOp[0]), |
| 139 | + Tsurgeon.parseOperation("relabel target " + newOp[1]))); |
213 | 140 |
|
| 141 | + } |
| 142 | + String newLine = System.lineSeparator(); |
| 143 | + String rawPattern = String.join(newLine, |
214 | 144 | // ------------------------------ |
215 | 145 | // 1 to 1 mappings |
216 | 146 | // ------------------------------ |
@@ -436,7 +366,8 @@ public static void load() { |
436 | 366 | "relabel target X"); |
437 | 367 | StringReader reader = new StringReader(rawPattern); |
438 | 368 | try (BufferedReader buffered = new BufferedReader(reader)) { |
439 | | - operations = Tsurgeon.getOperationsFromReader(buffered, new TregexPatternCompiler()); |
| 369 | + List<Pair<TregexPattern, TsurgeonPattern>> newOperations = Tsurgeon.getOperationsFromReader(buffered, new TregexPatternCompiler()); |
| 370 | + operations.addAll(newOperations); |
440 | 371 | } catch (IOException e) { |
441 | 372 | throw new RuntimeIOException(e); |
442 | 373 | } |
|
0 commit comments