@@ -48,12 +48,16 @@ def cluster
4848 # The record_detections method is the one-stop method to call every Detector's record method that is defined within
4949 # the application.
5050 #
51+ # @note This method is called by the calculate_categorizations method, so it is not necessary to call it separately.
52+ # It is also called by the features method to ensure that the latest detections are always available if
53+ # calculate_categorizations is not called first.
54+ #
5155 # @return nil
5256 def record_detections
53- Detector ::MlCitation . record ( self ) if Detector ::MlCitation . expected_env?
54- Detector ::Citation . record ( self )
55- Detector ::StandardIdentifiers . record ( self )
56- Detector ::Journal . record ( self )
57+ @ml_citation = Detector ::MlCitation . record ( self ) if Detector ::MlCitation . expected_env?
58+ @citation_summary = Detector ::Citation . record ( self )
59+ @std_identifiers = Detector ::StandardIdentifiers . record ( self )
60+ @journal = Detector ::Journal . record ( self )
5761 Detector ::Lcsh . record ( self )
5862 @suggested_resource_category = Detector ::SuggestedResource . record ( self )
5963 @suggested_pattern_category = Detector ::SuggestedResourcePattern . record ( self )
@@ -91,6 +95,45 @@ def calculate_categorizations
9195 end
9296 end
9397
98+ # Extracted features from various detectors are returned as a hash. This method is used to provide a summary of
99+ # the Term's features, which are used for machine learning and other purposes.
100+ #
101+ # @return [Hash] a hash of features extracted from the Term
102+ #
103+ # @note This method calls record_detections only when @citation_summary is blank. Once it is populated, the
104+ # stored detector results are reused as-is and are not refreshed by later calls to this method.
def features
  # Populate the detector results on first use; a non-blank @citation_summary is treated as "already recorded".
  record_detections if @citation_summary.blank?

  # Fall back to empty hashes so that a detector which recorded nothing yields zero counts and nil identifiers
  # instead of raising NoMethodError on nil.
  # NOTE(review): assumes both detector results respond to [] with symbol keys, as the lookups below require.
  summary = @citation_summary || {}
  identifiers = @std_identifiers || {}

  # Citation pattern counts first, then character/word counts; a missing entry is reported as 0.
  count_keys = %i[
    apa_volume_issue vol no pages pp year_parens brackets lastnames
    characters colons commas quotes periods semicolons words
  ]
  counts = count_keys.each_with_object({}) { |key, memo| memo[key] = summary[key] || 0 }

  {
    counts: counts,
    barcode: identifiers[:barcode],
    doi: identifiers[:doi],
    isbn: identifiers[:isbn],
    issn: identifiers[:issn],
    pmid: identifiers[:pmid],
    # Only the name of the first detected journal is exposed; nil when no journal was detected.
    journal: @journal&.first&.name,
    ml_citation: @ml_citation
  }
end
136+
94137 private
95138
96139 # register_fingerprint method gets called before a Term record is saved, ensuring that Terms should always have a
0 commit comments