2323 */
2424package org .opengrok .indexer .analysis ;
2525
26+ import java .io .BufferedInputStream ;
2627import java .io .BufferedReader ;
2728import java .io .File ;
29+ import java .io .FileInputStream ;
2830import java .io .FileWriter ;
2931import java .io .IOException ;
3032import java .io .InputStream ;
4749import java .util .TreeSet ;
4850import java .util .logging .Level ;
4951import java .util .logging .Logger ;
52+
5053import org .apache .lucene .document .DateTools ;
5154import org .apache .lucene .document .Document ;
5255import org .apache .lucene .document .Field ;
130133 */
131134public class AnalyzerGuru {
132135
136+ /**
137+ * A value used as a placeholder for a filename when content is anonymous
138+ * (e.g. from temporary source or from a stream for which an identifier is
139+ * not available).
140+ */
141+ public static final String ANONYMOUS_NAME = "<anonymous>" ;
142+
133143 /**
134144 * The maximum number of characters (multi-byte if a BOM is identified) to
135145 * read from the input stream to be used for magic string matching.
@@ -551,29 +561,92 @@ public static AbstractAnalyzer getAnalyzer(String fileTypeName) {
551561 }
552562
553563 /**
554- * Get an analyzer suited to analyze a file. This function will reuse
555- * analyzers since they are costly .
564+ * Gets an analyzer factory suited to analyze a file, but without a check
565+ * for Huge Text since the file size is not available .
556566 *
557567 * @param in Input stream containing data to be analyzed
558- * @param file Name of the file to be analyzed
559- * @return An analyzer suited for that file content
568+ * @param fileName Name of the file to be analyzed
569+ * @return An analyzer factory suited for that file content
560570 * @throws java.io.IOException If an error occurs while accessing the data
561571 * in the input stream.
562572 */
563- public static AbstractAnalyzer getAnalyzer (InputStream in , String file ) throws IOException {
564- AnalyzerFactory factory = find (in , file );
573+ public static AnalyzerFactory getAnalyzerFactory (InputStream in , String fileName )
574+ throws IOException {
575+ AnalyzerFactory factory = find (in , fileName );
565576 if (factory == null ) {
566- AbstractAnalyzer defaultAnalyzer = getAnalyzer () ;
577+ factory = DEFAULT_ANALYZER_FACTORY ;
567578 if (LOGGER .isLoggable (Level .FINEST )) {
579+ AbstractAnalyzer defaultAnalyzer = factory .getAnalyzer ();
568580 LOGGER .log (Level .FINEST , "{0}: fallback {1}" ,
569- new Object []{file ,
570- defaultAnalyzer .getClass ().getSimpleName () });
581+ new Object []{fileName , defaultAnalyzer .getClass ().getSimpleName ()});
571582 }
572- return defaultAnalyzer ;
573583 }
584+ return factory ;
585+ }
586+
587+ /**
588+ * Gets an analyzer suited to analyze a file, but without a check for Huge
589+ * Text since the file size is not available.
590+ *
591+ * @param in Input stream containing data to be analyzed
592+ * @param fileName Name of the file to be analyzed
593+ * @return An analyzer factory suited for the file content
594+ * @throws java.io.IOException If an error occurs while accessing the data
595+ * in the input stream.
596+ */
597+ public static AbstractAnalyzer getAnalyzer (InputStream in , String fileName )
598+ throws IOException {
599+ AnalyzerFactory factory = getAnalyzerFactory (in , fileName );
574600 return factory .getAnalyzer ();
575601 }
576602
603+ /**
604+ * Gets an analyzer factory suited to analyze a file, with a check for Huge
605+ * Text.
606+ *
607+ * @param file a defined instance to be analyzed
608+ * @param path Name (possibly normalized) of the file to be analyzed
609+ * @param logHugeText a value indicating whether to log if the file is
610+ * identified as Huge Text
611+ * @return An analyzer factory suited for the file content
612+ * @throws java.io.IOException If an error occurs while reading the file
613+ */
614+ public static AnalyzerFactory getAnalyzerFactory (File file , String path , boolean logHugeText )
615+ throws IOException {
616+
617+ AnalyzerFactory fac ;
618+ try (InputStream in = new BufferedInputStream (
619+ new FileInputStream (file ))) {
620+ fac = AnalyzerGuru .getAnalyzerFactory (in , path );
621+ }
622+
623+ if (AbstractAnalyzer .Genre .PLAIN .equals (fac .getGenre ()) &&
624+ file .length () >= RuntimeEnvironment .getInstance ().getHugeTextThresholdBytes ()) {
625+ fac = HugeTextAnalyzerFactory .DEFAULT_INSTANCE ;
626+ if (logHugeText && LOGGER .isLoggable (Level .WARNING )) {
627+ String origFileTypeName = fac .getAnalyzer ().getFileTypeName ();
628+ LOGGER .log (Level .WARNING , "{0} is huge text: {1}" ,
629+ new Object []{origFileTypeName , path });
630+ }
631+ }
632+ return fac ;
633+ }
634+
635+ /**
636+ * Get an analyzer suited to analyze a file, with a check for Huge Text.
637+ *
638+ * @param file a defined instance to be analyzed
639+ * @param path Name (possibly normalized) of the file to be analyzed
640+ * @param logHugeText a value indicating whether to log if the file is
641+ * identified as Huge Text
642+ * @return An analyzer suited for the file content
643+ * @throws java.io.IOException If an error occurs while reading the file
644+ */
645+ public static AbstractAnalyzer getAnalyzer (File file , String path , boolean logHugeText )
646+ throws IOException {
647+ return getAnalyzerFactory (file , path , logHugeText ).getAnalyzer ();
648+ }
649+
577650 /**
578651 * Free resources associated with all registered analyzers.
579652 */
@@ -718,24 +791,36 @@ public static void writeDumpedXref(String contextPath,
718791 }
719792
720793 /**
721- * Get the genre of a file.
794+ * Get the genre of a file, with a check for Huge Text .
722795 *
723796 * @param file The file to inspect
797+ * @param fileName name of the file to inspect
724798 * @return The genre suitable to decide how to display the file
725799 */
726- public static AbstractAnalyzer .Genre getGenre (String file ) {
727- return getGenre (find (file ));
800+ public static AbstractAnalyzer .Genre getGenre (File file , String fileName ) {
801+ try {
802+ return getGenre (getAnalyzerFactory (file , fileName , true ));
803+ } catch (IOException e ) {
804+ LOGGER .log (Level .WARNING , "Error reading {0}" , fileName );
805+ return null ;
806+ }
728807 }
729808
730809 /**
731- * Get the genre of a bulk of data.
810+ * Get the genre of a bulk of data, but without a check for Huge Text since
811+ * the file size is not available.
732812 *
733813 * @param in A stream containing the data
814+ * @param fileName name of the file to inspect
734815 * @return The genre suitable to decide how to display the file
735- * @throws java.io.IOException If an error occurs while getting the content
736816 */
737- public static AbstractAnalyzer .Genre getGenre (InputStream in ) throws IOException {
738- return getGenre (find (in ));
817+ public static AbstractAnalyzer .Genre getGenre (InputStream in , String fileName ) {
818+ try {
819+ return getGenre (getAnalyzerFactory (in , fileName ));
820+ } catch (IOException e ) {
821+ LOGGER .log (Level .WARNING , "Error reading {0}" , fileName );
822+ return null ;
823+ }
739824 }
740825
741826 /**
@@ -881,31 +966,36 @@ private static AnalyzerFactory findFactory(Class<?> factoryClass)
881966 *
882967 *
883968 * @param in The input stream containing the data
884- * @param file The file name to get the analyzer for
969+ * @param fileName The file name to get the analyzer for
885970 * @return the analyzer factory to use
886971 * @throws java.io.IOException If a problem occurs while reading the data
887972 */
888- public static AnalyzerFactory find (InputStream in , String file )
889- throws IOException {
890- AnalyzerFactory factory = find (file );
973+ static AnalyzerFactory find (InputStream in , String fileName ) throws IOException {
974+ AnalyzerFactory factory = find (fileName );
891975 // TODO above is not that great, since if 2 analyzers share one extension
892976 // then only the first one registered will own it
893977 // it would be cool if above could return more analyzers and below would
894978 // then decide between them ...
895979 if (factory != null ) {
896980 return factory ;
897981 }
898- return findForStream (in , file );
982+ return findForStream (in , fileName );
899983 }
900984
901985 /**
902- * Finds a suitable analyser class for file name.
986+ * Finds a suitable analyser class for {@code fileName}, which should only
987+ * be used in rare situations, such as for a JAR member or when content is
988+ * not available to support a full determination.
989+ * <p>To clarify, a full determination as done by
990+ * {@link #getAnalyzerFactory(File, String, boolean)} also reads a bit of
991+ * content as well as inspects file length to determine the ultimate
992+ * analyser.
903993 *
904- * @param file The file name to get the analyzer for
994+ * @param fileName The file name to get the analyzer for
905995 * @return the analyzer factory to use
906996 */
907- public static AnalyzerFactory find (String file ) {
908- String path = file ;
997+ public static AnalyzerFactory find (String fileName ) {
998+ String path = fileName ;
909999 int i ;
9101000
9111001 // Get basename of the file first.
@@ -924,8 +1014,7 @@ public static AnalyzerFactory find(String file) {
9241014 if (factory != null ) {
9251015 if (LOGGER .isLoggable (Level .FINEST )) {
9261016 LOGGER .log (Level .FINEST , "{0}: chosen by prefix: {1}" ,
927- new Object []{file ,
928- factory .getClass ().getSimpleName () });
1017+ new Object []{fileName , factory .getClass ().getSimpleName ()});
9291018 }
9301019 return factory ;
9311020 }
@@ -938,8 +1027,7 @@ public static AnalyzerFactory find(String file) {
9381027 if (factory != null ) {
9391028 if (LOGGER .isLoggable (Level .FINEST )) {
9401029 LOGGER .log (Level .FINEST , "{0}: chosen by suffix: {1}" ,
941- new Object []{file ,
942- factory .getClass ().getSimpleName () });
1030+ new Object []{fileName , factory .getClass ().getSimpleName ()});
9431031 }
9441032 return factory ;
9451033 }
@@ -957,22 +1045,22 @@ public static AnalyzerFactory find(String file) {
9571045 * @throws java.io.IOException if an error occurs while reading data from
9581046 * the stream
9591047 */
960- public static AnalyzerFactory find (InputStream in ) throws IOException {
961- return findForStream (in , "<anonymous>" );
1048+ static AnalyzerFactory find (InputStream in ) throws IOException {
1049+ return findForStream (in , ANONYMOUS_NAME );
9621050 }
9631051
9641052 /**
9651053 * Finds a suitable analyzer class for the data in this stream
9661054 * corresponding to a file of the specified name.
9671055 *
9681056 * @param in The stream containing the data to analyze
969- * @param file The file name to get the analyzer for
1057+ * @param fileName The file name to get the analyzer for
9701058 * @return the analyzer factory to use
9711059 * @throws java.io.IOException if an error occurs while reading data from
9721060 * the stream
9731061 */
974- private static AnalyzerFactory findForStream (InputStream in ,
975- String file ) throws IOException {
1062+ private static AnalyzerFactory findForStream (InputStream in , String fileName )
1063+ throws IOException {
9761064
9771065 in .mark (MAGIC_BYTES_NUM );
9781066 byte [] content = new byte [MAGIC_BYTES_NUM ];
@@ -998,8 +1086,8 @@ private static AnalyzerFactory findForStream(InputStream in,
9981086 if (fac != null ) {
9991087 if (LOGGER .isLoggable (Level .FINEST )) {
10001088 LOGGER .log (Level .FINEST ,
1001- "{0}: chosen by precise magic: {1}" , new Object []{
1002- file , fac .getClass ().getSimpleName () });
1089+ "{0}: chosen by precise magic: {1}" ,
1090+ new Object []{ fileName , fac .getClass ().getSimpleName ()});
10031091 }
10041092 return fac ;
10051093 }
@@ -1008,7 +1096,7 @@ private static AnalyzerFactory findForStream(InputStream in,
10081096
10091097 // Next, look for magic strings
10101098 String opening = readOpening (in , content );
1011- fac = findMagicString (opening , file );
1099+ fac = findMagicString (opening , fileName );
10121100 if (fac != null ) {
10131101 return fac ;
10141102 }
@@ -1020,9 +1108,8 @@ private static AnalyzerFactory findForStream(InputStream in,
10201108 if (fac != null ) {
10211109 if (LOGGER .isLoggable (Level .FINEST )) {
10221110 LOGGER .log (Level .FINEST ,
1023- "{0}: chosen by imprecise magic: {1}" ,
1024- new Object []{file ,
1025- fac .getClass ().getSimpleName () });
1111+ "{0}: chosen by imprecise magic: {1}" ,
1112+ new Object []{fileName , fac .getClass ().getSimpleName ()});
10261113 }
10271114 return fac ;
10281115 }
@@ -1032,16 +1119,15 @@ private static AnalyzerFactory findForStream(InputStream in,
10321119 return null ;
10331120 }
10341121
1035- private static AnalyzerFactory findMagicString (String opening , String file ) {
1122+ private static AnalyzerFactory findMagicString (String opening , String fileName ) {
10361123
10371124 // first, try to look up two words in magics
10381125 String fragment = getWords (opening , 2 );
10391126 AnalyzerFactory fac = magics .get (fragment );
10401127 if (fac != null ) {
10411128 if (LOGGER .isLoggable (Level .FINEST )) {
10421129 LOGGER .log (Level .FINEST , "{0}: chosen by magic {2}: {1}" ,
1043- new Object []{file , fac .getClass ().getSimpleName (),
1044- fragment });
1130+ new Object []{fileName , fac .getClass ().getSimpleName (), fragment });
10451131 }
10461132 return fac ;
10471133 }
@@ -1052,8 +1138,7 @@ private static AnalyzerFactory findMagicString(String opening, String file) {
10521138 if (fac != null ) {
10531139 if (LOGGER .isLoggable (Level .FINEST )) {
10541140 LOGGER .log (Level .FINEST , "{0}: chosen by magic {2}: {1}" ,
1055- new Object []{file , fac .getClass ().getSimpleName (),
1056- fragment });
1141+ new Object []{fileName , fac .getClass ().getSimpleName (), fragment });
10571142 }
10581143 return fac ;
10591144 }
@@ -1066,8 +1151,8 @@ private static AnalyzerFactory findMagicString(String opening, String file) {
10661151 fac = entry .getValue ();
10671152 if (LOGGER .isLoggable (Level .FINEST )) {
10681153 LOGGER .log (Level .FINEST ,
1069- "{0}: chosen by magic(substr) {2}: {1}" , new Object []{
1070- file , fac .getClass ().getSimpleName (), magic });
1154+ "{0}: chosen by magic(substr) {2}: {1}" ,
1155+ new Object []{ fileName , fac .getClass ().getSimpleName (), magic });
10711156 }
10721157 return fac ;
10731158 }
0 commit comments