@@ -50,22 +50,24 @@ public class ZipSplitter implements Splitter<BytesHandle> {
5050 /**
5151 * Returns the extensionFormats map set on the splitter. The map is pre-defined in the splitter.
5252 * It includes the "json", "txt" and "xml" extensions. If a file has no extension, it is treated as a binary file.
53+ * You can also add mappings from other extensions in the zipfile to one of the four MarkLogic formats.
5354 * @return the splitter's own map of extensionFormats (the map itself is returned, not a copy)
5455 */
5556 public Map <String , Format > getExtensionFormats () {
5657 return this .extensionFormats ;
5758 }
5859
5960 /**
60- * Returns the entryFileter set to splitter.
61- * @return the entryFileter set to splitter
61+ * Returns the entryFilter set on the splitter.
62+ * @return the entryFilter currently set on the splitter
6263 */
6364 public Predicate <ZipEntry > getEntryFilter () {
6465 return this .entryFilter ;
6566 }
6667
6768 /**
68- * Used to set entryFilter to splitter.
69+ * Sets the entryFilter on the splitter. The entryFilter is a predicate (typically a lambda) that inspects
70+ * each zip entry and returns false for any document in the zipfile that should be ignored.
6971 * @param entryFilter the filter that is applied to each zipEntry
7072 */
7173 public void setEntryFilter (Predicate <ZipEntry > entryFilter ) {
@@ -81,7 +83,8 @@ public Function<String, String> getUriTransformer() {
8183 }
8284
8385 /**
84- * Used to set uriTransformer to splitter
86+ * Sets the uriTransformer on the splitter. The uriTransformer is a function (typically a lambda) that
87+ * transforms the name of a document in the zipfile into the document URI for the database.
8588 * @param uriTransformer the uriTransformer that is applied to each document name to produce its URI
8689 */
8790 public void setUriTransformer (Function <String , String > uriTransformer ) {
@@ -106,6 +109,7 @@ public long getCount() {
106109
107110 /**
108111 * Takes an input stream of a ZIP file and converts it to a stream of BytesHandle.
112+ * The input stream must be a ZipInputStream; otherwise an exception is thrown.
109113 * The ZIP file can contain XML, JSON, TXT and BINARY files.
110114 * @param input is the incoming input stream
111115 * @return a stream of BytesHandle
@@ -140,9 +144,8 @@ public Stream<BytesHandle> split(ZipInputStream input) throws IOException {
140144 bytesHandleSpliterator .setZipStream (input );
141145 bytesHandleSpliterator .setEntryFilter (this .entryFilter );
142146 bytesHandleSpliterator .setExtensionFormats (this .extensionFormats );
143- bytesHandleSpliterator .setUriTransformer (this .uriTransformer );
144147
145- return StreamSupport .stream (bytesHandleSpliterator , false );
148+ return StreamSupport .stream (bytesHandleSpliterator , true );
146149 }
147150
148151 /**
@@ -162,9 +165,8 @@ public Stream<DocumentWriteOperation> splitWriteOperations(ZipInputStream input)
162165 documentWriteOperationSpliterator .setZipStream (input );
163166 documentWriteOperationSpliterator .setEntryFilter (this .entryFilter );
164167 documentWriteOperationSpliterator .setExtensionFormats (this .extensionFormats );
165- documentWriteOperationSpliterator .setUriTransformer (this .uriTransformer );
166168
167- return StreamSupport .stream (documentWriteOperationSpliterator , false );
169+ return StreamSupport .stream (documentWriteOperationSpliterator , true );
168170 }
169171
170172 private static class FormatEntry {
@@ -201,7 +203,6 @@ private static abstract class ZipEntrySpliterator<T> extends Spliterators.Abstra
201203 private ZipInputStream zipStream ;
202204 private Map <String ,Format > extensionFormats ;
203205 private Predicate <ZipEntry > entryFilter ;
204- private Function <String , String > uriTransformer ;
205206
206207 ZipEntrySpliterator (long est , int additionalCharacteristics ) {
207208 super (est , additionalCharacteristics );
@@ -238,27 +239,15 @@ void setEntryFilter(Predicate<ZipEntry> entryFilter) {
238239 this .entryFilter = entryFilter ;
239240 }
240241
241- Function <String , String > getUriTransformer () {
242- return this .uriTransformer ;
243- }
244-
245- void setUriTransformer (Function <String , String > uriTransformer ) {
246- this .uriTransformer = uriTransformer ;
247- }
248-
249242 protected FormatEntry getNextEntry () throws IOException {
250243 ZipEntry candidateEntry ;
251244
252245 while ((candidateEntry = getZipStream ().getNextEntry ()) != null ) {
253- if (getEntryFilter () != null && getEntryFilter ().test (candidateEntry ) == false ) {
246+ if (getEntryFilter () != null && ! getEntryFilter ().test (candidateEntry )) {
254247 continue ;
255248 }
256249
257250 String name = candidateEntry .getName ();
258- if (getUriTransformer () != null ) {
259- name = getUriTransformer ().apply (name );
260- }
261-
262251 Matcher matcher = extensionRegex .matcher (name );
263252 matcher .find ();
264253 String extension = matcher .group (1 );
@@ -311,7 +300,7 @@ public boolean tryAdvance(Consumer<? super BytesHandle> action) {
311300 splitter .count ++;
312301
313302 } catch (IOException e ) {
314- e . printStackTrace ( );
303+ throw new RuntimeException ( "Could not read ZipEntry" , e );
315304 }
316305
317306 return true ;
@@ -340,8 +329,8 @@ public boolean tryAdvance(Consumer<? super DocumentWriteOperation> action) {
340329 String name = nextEntry .getZipEntry ().getName ();
341330
342331 String uri = name ;
343- if (getUriTransformer () != null ) {
344- uri = getUriTransformer () .apply (name );
332+ if (splitter . uriTransformer != null ) {
333+ uri = splitter . uriTransformer .apply (name );
345334 }
346335
347336 DocumentWriteOperationImpl documentWriteOperation = new DocumentWriteOperationImpl (
@@ -354,7 +343,7 @@ public boolean tryAdvance(Consumer<? super DocumentWriteOperation> action) {
354343 splitter .count ++;
355344
356345 } catch (IOException e ) {
357- e . printStackTrace ( );
346+ throw new RuntimeException ( "Could not read ZipEntry" , e );
358347 }
359348
360349 return true ;
0 commit comments