Skip to content

Commit 1b0a034

Browse files
authored
Merge pull request #264 from MITLibraries/tco-163-term-features
Add term features and counts to GraphQL and models
2 parents 806c473 + 037be3d commit 1b0a034

File tree

15 files changed

+362
-43
lines changed

15 files changed

+362
-43
lines changed

app/graphql/types/counts_type.rb

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# frozen_string_literal: true
2+
3+
module Types
4+
class CountsType < Types::BaseObject
5+
description 'Features extracted from the input term that are counts. Useful for machine learning.'
6+
7+
field :apa_volume_issue, Integer, null: true, description: 'Count of apa volume number pattern in the input'
8+
field :brackets, Integer, null: true, description: 'Count of brackets in the input'
9+
field :characters, Integer, null: true, description: 'Count of characters in the input'
10+
field :colons, Integer, null: true, description: 'Count of colons in the input'
11+
field :commas, Integer, null: true, description: 'Count of commas in the input'
12+
field :lastnames, Integer, null: true, description: 'Count of lastnames in the input. Not recommended for use in production.'
13+
field :no, Integer, null: true, description: 'Count of `no` in the input'
14+
field :pages, Integer, null: true, description: 'Count of `pages` in the input'
15+
field :periods, Integer, null: true, description: 'Count of Periods in the input'
16+
field :pp, Integer, null: true, description: 'Count of `pp` in the input'
17+
field :quotes, Integer, null: true, description: 'Count of &quot in the input'
18+
field :semicolons, Integer, null: true, description: 'Count of Semicolons in the input'
19+
field :vol, Integer, null: true, description: 'Count of `vol` in the input'
20+
field :words, Integer, null: true, description: 'Count of Words in the input'
21+
field :year_parens, Integer, null: true, description: 'Count of (year) in the input'
22+
end
23+
end

app/graphql/types/features_type.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# frozen_string_literal: true
2+
3+
module Types
4+
class FeaturesType < Types::BaseObject
5+
description 'Features extracted from the input term. Useful for machine learning.'
6+
7+
field :barcode, String, null: true
8+
field :counts, CountsType, null: true
9+
field :doi, String, null: true
10+
field :isbn, String, null: true
11+
field :issn, String, null: true
12+
field :journal, String, null: true
13+
field :ml_citation, Boolean, null: true
14+
field :pmid, String, null: true
15+
end
16+
end

app/graphql/types/search_event_type.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,17 @@ class SearchEventType < Types::BaseObject
55
field :categories, [Types::CategoriesType], description: 'The list of categories linked to term provided in this search'
66
field :created_at, GraphQL::Types::ISO8601DateTime, null: false
77
field :detectors, Types::DetectorsType
8+
field :features, FeaturesType, null: true
89
field :id, ID, null: false
910
field :phrase, String
1011
field :source, String
1112
field :term_id, Integer
1213
field :updated_at, GraphQL::Types::ISO8601DateTime, null: false
1314

15+
def features
16+
@object.term.features
17+
end
18+
1419
def phrase
1520
@object.term.phrase
1621
end

app/graphql/types/term_type.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ class TermType < Types::BaseObject
55
field :categories, [Types::CategoriesType], description: 'The list of categories linked to this term'
66
field :created_at, GraphQL::Types::ISO8601DateTime, null: false
77
field :detectors, Types::DetectorsType
8+
field :features, FeaturesType, null: true
89
field :id, ID, null: false
910
field :occurence_count, Integer
1011
field :phrase, String, null: false

app/models/detector/citation.rb

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -85,19 +85,22 @@ def detections
8585

8686
# The record method first runs all of the parsers by running the initialize method. If the resulting score is higher
8787
# than the REQUIRED_SCORE value, then a Detection is registered.
88+
#
8889
# @param term [Term]
89-
# @return nil
90+
#
91+
# @return [Hash] a hash of features extracted from the Term
9092
def self.record(term)
9193
cit = Detector::Citation.new(term.phrase)
92-
return unless cit.detection?
9394

94-
Detection.find_or_create_by(
95-
term:,
96-
detector: Detector.where(name: 'Citation').first,
97-
detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
98-
)
95+
if cit.detection?
96+
Detection.find_or_create_by(
97+
term:,
98+
detector: Detector.where(name: 'Citation').first,
99+
detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
100+
)
101+
end
99102

100-
nil
103+
cit.features
101104
end
102105

103106
private

app/models/detector/journal.rb

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,18 +45,19 @@ def self.partial_term_match(phrase)
4545
# @note This does not care whether multiple matching journals are detected. If _any_ match is found, a Detection
4646
# record is created. The uniqueness constraint on Detection records would make multiple detections irrelevant.
4747
#
48-
# @return nil
48+
# @return [Enumerable<Journal>] The ActiveRecord Journal records that fully match the term's phrase (empty if none).
4949
def self.record(term)
5050
result = full_term_match(term.phrase)
51-
return unless result.any?
5251

53-
Detection.find_or_create_by(
54-
term:,
55-
detector: Detector.where(name: 'Journal').first,
56-
detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
57-
)
52+
if result.any?
53+
Detection.find_or_create_by(
54+
term:,
55+
detector: Detector.where(name: 'Journal').first,
56+
detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
57+
)
58+
end
5859

59-
nil
60+
result
6061
end
6162
end
6263
end

app/models/detector/ml_citation.rb

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,20 +38,22 @@ def self.expected_env?
3838
# If a positive result is received, a Detection is registered.
3939
#
4040
# @param term [Term]
41-
# @return nil
41+
# @return [Boolean] the detector's result for the term, returned whether or not a Detection was registered
4242
def self.record(term)
4343
result = Detector::MlCitation.new(term.phrase)
44-
return unless result.detection?
45-
46-
# Detections are registered to the "MlCitation" detector for now, but may end up replacing the "Citation" detector
47-
# in a future step.
48-
Detection.find_or_create_by(
49-
term:,
50-
detector: Detector.where(name: 'MlCitation').first,
51-
detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
52-
)
5344

54-
nil
45+
if result.detection?
46+
47+
# Detections are registered to the "MlCitation" detector for now, but may end up replacing the "Citation" detector
48+
# in a future step.
49+
Detection.find_or_create_by(
50+
term:,
51+
detector: Detector.where(name: 'MlCitation').first,
52+
detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
53+
)
54+
end
55+
56+
result.detections
5557
end
5658

5759
# lambda_path reads and returns the value of one environment variable.

app/models/detector/standard_identifiers.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def initialize(phrase)
3232
# a separate Detection record (although a single check finding multiple matches would still only result in one
3333
# Detection for that check).
3434
#
35-
# @return nil
35+
# @return [Hash] a hash of standard identifiers extracted from the Term
3636
def self.record(term)
3737
si = Detector::StandardIdentifiers.new(term.phrase)
3838

@@ -44,7 +44,7 @@ def self.record(term)
4444
)
4545
end
4646

47-
nil
47+
si.detections
4848
end
4949

5050
private

app/models/term.rb

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,16 @@ def cluster
4848
# The record_detections method is the one-stop method to call every Detector's record method that is defined within
4949
# the application.
5050
#
51+
# @note This method is called by the calculate_categorizations method, so it is not necessary to call it separately.
52+
# It is also called by the features method to ensure that the latest detections are always available if
53+
# calculate_categorizations is not called first.
54+
#
5155
# @return nil
5256
def record_detections
53-
Detector::MlCitation.record(self) if Detector::MlCitation.expected_env?
54-
Detector::Citation.record(self)
55-
Detector::StandardIdentifiers.record(self)
56-
Detector::Journal.record(self)
57+
@ml_citation = Detector::MlCitation.record(self) if Detector::MlCitation.expected_env?
58+
@citation_summary = Detector::Citation.record(self)
59+
@std_identifiers = Detector::StandardIdentifiers.record(self)
60+
@journal = Detector::Journal.record(self)
5761
Detector::Lcsh.record(self)
5862
@suggested_resource_category = Detector::SuggestedResource.record(self)
5963
@suggested_pattern_category = Detector::SuggestedResourcePattern.record(self)
@@ -91,6 +95,45 @@ def calculate_categorizations
9195
end
9296
end
9397

98+
# Extracted features from various detectors are returned as a hash. This method is used to provide a summary of
99+
# the Term's features, which are used for machine learning and other purposes.
100+
#
101+
# @return [Hash] a hash of features extracted from the Term
102+
#
103+
# @note This method calls record_detections only when @citation_summary is blank (not yet populated).
104+
# This makes the features available without a prior call; values recorded by an earlier call are reused as-is.
105+
def features
106+
record_detections if @citation_summary.blank?
107+
{
108+
counts: {
109+
apa_volume_issue: @citation_summary[:apa_volume_issue] || 0,
110+
vol: @citation_summary[:vol] || 0,
111+
no: @citation_summary[:no] || 0,
112+
pages: @citation_summary[:pages] || 0,
113+
pp: @citation_summary[:pp] || 0,
114+
year_parens: @citation_summary[:year_parens] || 0,
115+
brackets: @citation_summary[:brackets] || 0,
116+
lastnames: @citation_summary[:lastnames] || 0,
117+
118+
characters: @citation_summary[:characters] || 0,
119+
colons: @citation_summary[:colons] || 0,
120+
commas: @citation_summary[:commas] || 0,
121+
quotes: @citation_summary[:quotes] || 0,
122+
periods: @citation_summary[:periods] || 0,
123+
semicolons: @citation_summary[:semicolons] || 0,
124+
words: @citation_summary[:words] || 0
125+
},
126+
127+
barcode: @std_identifiers[:barcode],
128+
doi: @std_identifiers[:doi],
129+
isbn: @std_identifiers[:isbn],
130+
issn: @std_identifiers[:issn],
131+
pmid: @std_identifiers[:pmid],
132+
journal: @journal&.first&.name,
133+
ml_citation: @ml_citation
134+
}
135+
end
136+
94137
private
95138

96139
# register_fingerprint method gets called before a Term record is saved, ensuring that Terms should always have a

config/environments/development.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@
7878

7979
# Local logging overrides
8080
config.logger = Logger.new(STDOUT)
81-
config.log_level = :debug
81+
config.log_level = :info
8282

8383
# Apply autocorrection by RuboCop to files generated by `bin/rails generate`.
8484
# config.generators.apply_rubocop_autocorrect_after_generate!

0 commit comments

Comments
 (0)