Skip to content

Commit cf35392

Browse files
Merge pull request #266 from MITLibraries/tco-190-filter
Filter searches before citation detector
2 parents 1b0a034 + fb47d97 commit cf35392

File tree

5 files changed

+52
-8
lines changed

5 files changed

+52
-8
lines changed

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ group :test do
128128
# Use system testing [https://guides.rubyonrails.org/testing.html#system-testing]
129129
gem 'capybara'
130130
gem 'climate_control'
131+
gem 'mocha'
131132
gem 'selenium-webdriver'
132133
gem 'simplecov'
133134
gem 'simplecov-lcov'

Gemfile.lock

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,8 @@ GEM
237237
matrix (0.4.2)
238238
mini_mime (1.1.5)
239239
minitest (5.25.5)
240+
mocha (2.7.1)
241+
ruby2_keywords (>= 0.0.5)
240242
msgpack (1.8.0)
241243
multi_json (1.15.0)
242244
net-http (0.6.0)
@@ -398,6 +400,7 @@ GEM
398400
rubocop (>= 1.75.0, < 2.0)
399401
rubocop-ast (>= 1.44.0, < 2.0)
400402
ruby-progressbar (1.13.0)
403+
ruby2_keywords (0.0.5)
401404
rubyzip (2.4.1)
402405
sassc (2.4.0)
403406
ffi (~> 1.9)
@@ -518,6 +521,7 @@ DEPENDENCIES
518521
importmap-rails
519522
jbuilder
520523
mitlibraries-theme!
524+
mocha
521525
omniauth
522526
omniauth-rails_csrf_protection
523527
omniauth_openid_connect

app/models/detector/ml_citation.rb

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,16 @@ class MlCitation
77
# For now the initialize method just needs to consult the external lambda.
88
#
99
# @param phrase String. Often a `Term.phrase`.
10-
# @return Nothing intentional. Data is written to Hash `@detections` during processing.
10+
# @return Nothing intentional. Data is written to Boolean `@detections` during processing.
1111
def initialize(phrase)
1212
return unless self.class.expected_env?
1313

14-
response = fetch(phrase)
14+
@detections = false
15+
16+
features = extract_features(phrase)
17+
return unless enough_nonzero_values?(features)
18+
19+
response = fetch(features)
1520
@detections = response unless response == 'Error'
1621
end
1722

@@ -111,10 +116,10 @@ def define_lambda
111116
# define_payload defines the Hash that will be sent to the lambda.
112117
#
113118
# @return Hash
114-
def define_payload(phrase)
119+
def define_payload(features)
115120
{
116121
action: 'predict',
117-
features: extract_features(phrase),
122+
features: features,
118123
challenge_secret: self.class.lambda_secret
119124
}
120125
end
@@ -135,9 +140,9 @@ def extract_features(phrase)
135140
# error handling with the response.
136141
#
137142
# @return Boolean or 'Error'
138-
def fetch(phrase)
143+
def fetch(features)
139144
lambda = define_lambda
140-
payload = define_payload(phrase)
145+
payload = define_payload(features)
141146

142147
response = lambda.post(self.class.lambda_path, payload.to_json)
143148

@@ -151,5 +156,18 @@ def fetch(phrase)
151156
'Error'
152157
end
153158
end
159+
160+
# enough_nonzero_values? checks that a provided hash contains at least three values which are not zero.
161+
#
162+
# @note We chose 3 as our value here after analyzing the behavior of the citation detector across nearly a year of
163+
# search traffic. For searches which had only one or two features that are not zero, we found no actual citations.
164+
# To see the analyses, look at the "Filtering results" and "Surprising predictions" notebooks at
165+
# https://github.com/MITLibraries/tacos-notebooks/tree/main/notebooks/explorations
166+
#
167+
# @param hash Hash
168+
# @return Boolean
169+
def enough_nonzero_values?(hash)
170+
hash.values.count { |v| v != 0 } >= 3
171+
end
154172
end
155173
end

test/models/detector/ml_citation_test.rb

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,32 @@ class MlCitationTest < ActiveSupport::TestCase
7777

7878
assert_instance_of Detector::MlCitation, prediction
7979

80-
assert_nil(prediction.detections)
80+
assert_equal(false, prediction.detections)
8181
end
8282
end
8383
end
8484
end
8585

86+
test 'lookup skips fetching a prediction for search phrases with less than three features' do
87+
Detector::MlCitation.any_instance.expects(:fetch).never
88+
89+
with_enabled_mlcitation do
90+
# This search phrase is expected to have only two non-zero feature values, which based on
91+
# our analyses will never result in a predicted citation.
92+
Detector::MlCitation.new('foobar (2025)')
93+
end
94+
end
95+
96+
test 'lookup does not skip fetching a prediction for search phrases with three or more features' do
97+
Detector::MlCitation.any_instance.expects(:fetch).once
98+
99+
with_enabled_mlcitation do
100+
# This search phrase is expected to have three non-zero feature values, which is the minimum
101+
# number we expect to have any hope of a citation.
102+
Detector::MlCitation.new('foobar (2025) 1234-76')
103+
end
104+
end
105+
86106
# Record method
87107
test 'record does relevant work' do
88108
with_enabled_mlcitation do

test/test_helper.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
ENV['RAILS_ENV'] ||= 'test'
1414
require_relative '../config/environment'
1515
require 'rails/test_help'
16+
require 'mocha/minitest'
1617

1718
VCR.configure do |config|
1819
config.ignore_localhost = false
@@ -124,4 +125,4 @@ def with_enabled_mlcitation
124125
}
125126
ensure
126127
ENV.replace(old_env)
127-
end
128+
end

0 commit comments

Comments
 (0)