Merge pull request #266 from MITLibraries/tco-190-filter

matt-bernhardt · web-flow · commit cf35392e40ad · 2025-08-01T09:59:50.000-04:00
Filter searches before citation detector
diff --git a/Gemfile b/Gemfile
@@ -128,6 +128,7 @@ group :test do
   # Use system testing [https://guides.rubyonrails.org/testing.html#system-testing]
   gem 'capybara'
   gem 'climate_control'
+  gem 'mocha'
   gem 'selenium-webdriver'
   gem 'simplecov'
   gem 'simplecov-lcov'
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -237,6 +237,8 @@ GEM
     matrix (0.4.2)
     mini_mime (1.1.5)
     minitest (5.25.5)
+    mocha (2.7.1)
+      ruby2_keywords (>= 0.0.5)
     msgpack (1.8.0)
     multi_json (1.15.0)
     net-http (0.6.0)
@@ -398,6 +400,7 @@ GEM
       rubocop (>= 1.75.0, < 2.0)
       rubocop-ast (>= 1.44.0, < 2.0)
     ruby-progressbar (1.13.0)
+    ruby2_keywords (0.0.5)
     rubyzip (2.4.1)
     sassc (2.4.0)
       ffi (~> 1.9)
@@ -518,6 +521,7 @@ DEPENDENCIES
   importmap-rails
   jbuilder
   mitlibraries-theme!
+  mocha
   omniauth
   omniauth-rails_csrf_protection
   omniauth_openid_connect
diff --git a/app/models/detector/ml_citation.rb b/app/models/detector/ml_citation.rb
@@ -7,11 +7,16 @@ class MlCitation
     # For now the initialize method just needs to consult the external lambda.
     #
     #   @param phrase String. Often a `Term.phrase`.
-    #   @return Nothing intentional. Data is written to Hash `@detections` during processing.
+    #   @return Nothing intentional. Boolean `@detections` is set during processing (left unset if `expected_env?` is false).
     def initialize(phrase)
       return unless self.class.expected_env?
 
-      response = fetch(phrase)
+      @detections = false
+
+      features = extract_features(phrase)
+      return unless enough_nonzero_values?(features)
+
+      response = fetch(features)
       @detections = response unless response == 'Error'
     end
 
@@ -111,10 +116,10 @@ def define_lambda
     # define_payload defines the Hash that will be sent to the lambda.
     #
     # @return Hash
-    def define_payload(phrase)
+    def define_payload(features)
       {
         action: 'predict',
-        features: extract_features(phrase),
+        features: features,
         challenge_secret: self.class.lambda_secret
       }
     end
@@ -135,9 +140,9 @@ def extract_features(phrase)
     # error handling with the response.
     #
     # @return Boolean or 'Error'
-    def fetch(phrase)
+    def fetch(features)
       lambda = define_lambda
-      payload = define_payload(phrase)
+      payload = define_payload(features)
 
       response = lambda.post(self.class.lambda_path, payload.to_json)
 
@@ -151,5 +156,18 @@ def fetch(phrase)
         'Error'
       end
     end
+
+    # enough_nonzero_values? checks that a provided hash contains at least three values which are not zero.
+    #
+    # @note We chose 3 as our value here after analyzing the behavior of the citation detector across nearly a year of
+    #   search traffic. For searches which had only one or two features that are not zero, we found no actual citations.
+    #   To see the analyses, look at the "Filtering results" and "Surprising predictions" notebooks at
+    #   https://github.com/MITLibraries/tacos-notebooks/tree/main/notebooks/explorations
+    #
+    # @param hash Hash
+    # @return Boolean
+    def enough_nonzero_values?(hash)
+      hash.values.count { |v| v != 0 } >= 3
+    end
   end
 end
diff --git a/test/models/detector/ml_citation_test.rb b/test/models/detector/ml_citation_test.rb
@@ -77,12 +77,32 @@ class MlCitationTest < ActiveSupport::TestCase
 
             assert_instance_of Detector::MlCitation, prediction
 
-            assert_nil(prediction.detections)
+            assert_equal(false, prediction.detections)
           end
         end
       end
     end
 
+    test 'lookup skips fetching a prediction for search phrases with less than three features' do
+      Detector::MlCitation.any_instance.expects(:fetch).never
+
+      with_enabled_mlcitation do
+        # This search phrase is expected to have only two non-zero feature values, which based on
+        # our analyses will never result in a predicted citation.
+        Detector::MlCitation.new('foobar (2025)')
+      end
+    end
+
+    test 'lookup does not skip fetching a prediction for search phrases with three or more features' do
+      Detector::MlCitation.any_instance.expects(:fetch).once
+
+      with_enabled_mlcitation do
+        # This search phrase is expected to have three non-zero feature values, which is the minimum
+        # number we expect to have any hope of a citation.
+        Detector::MlCitation.new('foobar (2025) 1234-76')
+      end
+    end
+
     # Record method
     test 'record does relevant work' do
       with_enabled_mlcitation do
diff --git a/test/test_helper.rb b/test/test_helper.rb
@@ -13,6 +13,7 @@
 ENV['RAILS_ENV'] ||= 'test'
 require_relative '../config/environment'
 require 'rails/test_help'
+require 'mocha/minitest'
 
 VCR.configure do |config|
   config.ignore_localhost = false
@@ -124,4 +125,4 @@ def with_enabled_mlcitation
   }
 ensure
   ENV.replace(old_env)
-end
+end