Skip to content

Commit b4cb5d0

Browse files
Pollepsjoepio
authored and committed
#226 Add parent filter to search
1 parent 019ed62 commit b4cb5d0

File tree

5 files changed

+278
-107
lines changed

5 files changed

+278
-107
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Changes to JS assets are not included here, but in [`atomic-data-browser`'s CHAN
66

77
## Unreleased
88

9+
- Add parent parameter to search endpoint which scopes a search to only the descendants of the given resource.
910
- Bookmark endpoint now also retrieves `og:image` and `og:description` #510
1011

1112
## [v0.33.1] - 2022-09-25
@@ -291,7 +292,7 @@ You can still build from source by cloning the repo.
291292

292293
## [v0.23.3]
293294

294-
- Added import / export to server and lib #121
295+
- Added import / export to server and lib #121
295296
- Added basic cli functionality with Clap to server #125
296297
- Added multi-resource JSON-AD array parsing #123
297298
- Use JSON-LD as default store #79

lib/src/resources.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,19 @@ impl Resource {
127127
}
128128
}
129129

130+
/// Walks the parent tree upwards until there is no parent, then returns them as a vector.
131+
pub fn get_parent_tree(&self, store: &impl Storelike) -> AtomicResult<Vec<Resource>> {
132+
let mut parents: Vec<Resource> = Vec::new();
133+
let mut current = self.clone();
134+
135+
while let Ok(parent) = current.get_parent(store) {
136+
parents.push(parent.clone());
137+
current = parent;
138+
}
139+
140+
Ok(parents)
141+
}
142+
130143
/// Returns all PropVals.
131144
/// Useful if you want to iterate over all Atoms / Properties.
132145
pub fn get_propvals(&self) -> &PropVals {

server/src/commit_monitor.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ impl CommitMonitor {
122122
crate::search::remove_resource(&self.search_state, &target)?;
123123
};
124124
// Add new resource to search index
125-
crate::search::add_resource(&self.search_state, resource)?;
125+
crate::search::add_resource(&self.search_state, resource, &self.store)?;
126126
self.run_expensive_next_tick = true;
127127
} else {
128128
// If there is no new resource, it must have been deleted, so let's remove it from the search index.

server/src/handlers/search.rs

Lines changed: 157 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,15 @@
44
//! Tantivy requires a strict schema, whereas Atomic is dynamic.
55
//! We deal with this discrepancy by
66
7-
use crate::{appstate::AppState, errors::AtomicServerResult};
7+
use std::collections::HashSet;
8+
9+
use crate::{
10+
appstate::AppState,
11+
errors::{AtomicServerError, AtomicServerResult},
12+
search::{resource_to_facet, Fields},
13+
};
814
use actix_web::{web, HttpResponse};
9-
use atomic_lib::{urls, Resource, Storelike};
15+
use atomic_lib::{errors::AtomicResult, urls, Db, Resource, Storelike};
1016
use serde::Deserialize;
1117
use tantivy::{collector::TopDocs, query::QueryParser};
1218

@@ -20,6 +26,8 @@ pub struct SearchQuery {
2026
pub limit: Option<usize>,
2127
/// Filter by Property URL
2228
pub property: Option<String>,
29+
/// Only include resources that have this resource as an ancestor
30+
pub parent: Option<String>,
2331
}
2432

2533
/// Parses a search query and responds with a list of resources
@@ -43,110 +51,45 @@ pub async fn search_query(
4351
default_limit
4452
};
4553

46-
let mut should_fuzzy = true;
47-
if params.property.is_some() {
48-
// Fuzzy searching is not possible when filtering by property
49-
should_fuzzy = false;
50-
}
54+
// With this first limit, we go for a greater number - as the user may not have the rights to the first ones!
55+
// We filter these results later.
56+
// https://github.com/atomicdata-dev/atomic-data-rust/issues/279.
57+
let initial_results_limit = 100;
5158

52-
let mut subjects: Vec<String> = Vec::new();
53-
// These are not used at this moment, but would be quite useful in RDF context.
54-
let mut atoms: Vec<StringAtom> = Vec::new();
59+
let mut query_list: Vec<(tantivy::query::Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
60+
61+
if let Some(parent) = params.parent.clone() {
62+
let query = build_parent_query(parent, &fields, store)?;
63+
64+
query_list.push((tantivy::query::Occur::Must, Box::new(query)));
65+
}
5566

5667
if let Some(q) = params.q.clone() {
57-
// If any of these substrings appear, the user wants an exact / advanced search
58-
let dont_fuzz_strings = vec!["*", "AND", "OR", "[", "\"", ":", "+", "-", " "];
59-
for substr in dont_fuzz_strings {
60-
if q.contains(substr) {
61-
should_fuzzy = false
62-
}
63-
}
68+
let fuzzy = should_fuzzy(&params.property, &q);
6469

65-
let query: Box<dyn tantivy::query::Query> = if should_fuzzy {
66-
let term = tantivy::Term::from_field_text(fields.value, &q);
67-
let query = tantivy::query::FuzzyTermQuery::new_prefix(term, 1, true);
68-
Box::new(query)
70+
let query = if fuzzy {
71+
build_fuzzy_query(&fields, &q)?
6972
} else {
70-
// construct the query
71-
let query_parser = QueryParser::for_index(
73+
build_query(
74+
&fields,
75+
&q,
76+
params.property.clone(),
7277
&appstate.search_state.index,
73-
vec![
74-
fields.subject,
75-
// I don't think we need to search in the property
76-
// fields.property,
77-
fields.value,
78-
],
79-
);
80-
let full_query = if let Some(prop) = &params.property {
81-
format!("property:{:?} AND {}", prop, &q)
82-
} else {
83-
q
84-
};
85-
query_parser
86-
.parse_query(&full_query)
87-
.map_err(|e| format!("Error parsing query {}", e))?
78+
)?
8879
};
8980

90-
// With this first limit, we go for a greater number - as the user may not have the rights to the first ones!
91-
// We filter these results later.
92-
// https://github.com/atomicdata-dev/atomic-data-rust/issues/279.
93-
let initial_results_limit = 100;
94-
95-
// execute the query
96-
let top_docs = searcher
97-
.search(&query, &TopDocs::with_limit(initial_results_limit))
98-
.map_err(|e| format!("Error with creating search results: {} ", e))?;
99-
100-
// convert found documents to resources
101-
for (_score, doc_address) in top_docs {
102-
let retrieved_doc = searcher.doc(doc_address)?;
103-
let subject_val = retrieved_doc.get_first(fields.subject).ok_or("No 'subject' in search doc found. This is required when indexing. Run with --rebuild-index")?;
104-
let prop_val = retrieved_doc.get_first(fields.property).ok_or("No 'property' in search doc found. This is required when indexing. Run with --rebuild-index")?;
105-
let value_val = retrieved_doc.get_first(fields.value).ok_or("No 'value' in search doc found. This is required when indexing. Run with --rebuild-index")?;
106-
let subject = match subject_val {
107-
tantivy::schema::Value::Str(s) => s.to_string(),
108-
_else => {
109-
return Err(format!(
110-
"Search schema error: Subject is not a string! Doc: {:?}",
111-
retrieved_doc
112-
)
113-
.into())
114-
}
115-
};
116-
let property = match prop_val {
117-
tantivy::schema::Value::Str(s) => s.to_string(),
118-
_else => {
119-
return Err(format!(
120-
"Search schema error: Property is not a string! Doc: {:?}",
121-
retrieved_doc
122-
)
123-
.into())
124-
}
125-
};
126-
let value = match value_val {
127-
tantivy::schema::Value::Str(s) => s.to_string(),
128-
_else => {
129-
return Err(format!(
130-
"Search schema error: Value is not a string! Doc: {:?}",
131-
retrieved_doc
132-
)
133-
.into())
134-
}
135-
};
136-
if subjects.contains(&subject) {
137-
continue;
138-
} else {
139-
subjects.push(subject.clone());
140-
let atom = StringAtom {
141-
subject,
142-
property,
143-
value,
144-
};
145-
atoms.push(atom);
146-
}
147-
}
81+
query_list.push((tantivy::query::Occur::Must, query));
14882
}
14983

84+
let query = tantivy::query::BooleanQuery::new(query_list);
85+
86+
// execute the query
87+
let top_docs = searcher
88+
.search(&query, &TopDocs::with_limit(initial_results_limit))
89+
.map_err(|e| format!("Error with creating search results: {} ", e))?;
90+
91+
let (subjects, _atoms) = docs_to_resources(top_docs, &fields, &searcher)?;
92+
15093
// Create a valid atomic data resource.
15194
// You'd think there would be a simpler way of getting the requested URL...
15295
let subject = format!(
@@ -213,7 +156,7 @@ pub async fn search_index_rdf(
213156
get_inner_value(t.object),
214157
) {
215158
(Some(s), Some(p), Some(o)) => {
216-
crate::search::add_triple(&writer, s, p, o, &fields).ok();
159+
crate::search::add_triple(&writer, s, p, o, None, &fields).ok();
217160
}
218161
_ => return Ok(()),
219162
};
@@ -244,9 +187,125 @@ fn get_inner_value(t: Term) -> Option<String> {
244187
}
245188
}
246189

247-
#[derive(Debug)]
190+
#[derive(Debug, std::hash::Hash, Eq, PartialEq)]
248191
pub struct StringAtom {
249192
pub subject: String,
250193
pub property: String,
251194
pub value: String,
252195
}
196+
/// Decides whether the query text `q` should be run as a fuzzy search.
///
/// Returns `false` when a `property` filter is present, or when `q` contains any
/// substring that signals an exact / advanced search. Returns `true` otherwise
/// (including for an empty `q`).
fn should_fuzzy(property: &Option<String>, q: &str) -> bool {
    // If any of these substrings appear, the user wants an exact / advanced search
    const ADVANCED_MARKERS: [&str; 9] = ["*", "AND", "OR", "[", "\"", ":", "+", "-", " "];

    // Fuzzy searching is not possible when filtering by property
    property.is_none() && !ADVANCED_MARKERS.iter().any(|marker| q.contains(*marker))
}
213+
214+
fn build_fuzzy_query(fields: &Fields, q: &str) -> AtomicResult<Box<dyn tantivy::query::Query>> {
215+
let term = tantivy::Term::from_field_text(fields.value, q);
216+
let query = tantivy::query::FuzzyTermQuery::new_prefix(term, 1, true);
217+
218+
Ok(Box::new(query))
219+
}
220+
221+
fn build_query(
222+
fields: &Fields,
223+
q: &str,
224+
property: Option<String>,
225+
index: &tantivy::Index,
226+
) -> AtomicResult<Box<dyn tantivy::query::Query>> {
227+
// construct the query
228+
let query_parser = QueryParser::for_index(
229+
index,
230+
vec![
231+
fields.subject,
232+
// I don't think we need to search in the property
233+
// fields.property,
234+
fields.value,
235+
],
236+
);
237+
238+
let query_text = if let Some(prop) = property {
239+
format!("property:{:?} AND {}", prop, &q)
240+
} else {
241+
q.to_string()
242+
};
243+
244+
let query = query_parser
245+
.parse_query(&query_text)
246+
.map_err(|e| format!("Error parsing query {}", e))?;
247+
248+
Ok(query)
249+
}
250+
251+
fn build_parent_query(
252+
subject: String,
253+
fields: &Fields,
254+
store: &Db,
255+
) -> AtomicServerResult<tantivy::query::TermQuery> {
256+
let resource = store.get_resource(subject.as_str())?;
257+
let facet = resource_to_facet(&resource, store)?;
258+
259+
let term = tantivy::Term::from_facet(fields.hierarchy, &facet);
260+
261+
Ok(tantivy::query::TermQuery::new(
262+
term,
263+
tantivy::schema::IndexRecordOption::Basic,
264+
))
265+
}
266+
267+
fn unpack_value(
268+
value: &tantivy::schema::Value,
269+
document: &tantivy::Document,
270+
name: String,
271+
) -> Result<String, AtomicServerError> {
272+
match value {
273+
tantivy::schema::Value::Str(s) => Ok(s.to_string()),
274+
_else => Err(format!(
275+
"Search schema error: {} is not a string! Doc: {:?}",
276+
name, document
277+
)
278+
.into()),
279+
}
280+
}
281+
282+
fn docs_to_resources(
283+
docs: Vec<(f32, tantivy::DocAddress)>,
284+
fields: &Fields,
285+
searcher: &tantivy::LeasedItem<tantivy::Searcher>,
286+
) -> Result<(Vec<String>, Vec<StringAtom>), AtomicServerError> {
287+
let mut subjects: HashSet<String> = HashSet::new();
288+
// These are not used at this moment, but would be quite useful in RDF context.
289+
let mut atoms: HashSet<StringAtom> = HashSet::new();
290+
291+
// convert found documents to resources
292+
for (_score, doc_address) in docs {
293+
let retrieved_doc = searcher.doc(doc_address)?;
294+
let subject_val = retrieved_doc.get_first(fields.subject).ok_or("No 'subject' in search doc found. This is required when indexing. Run with --rebuild-index")?;
295+
let prop_val = retrieved_doc.get_first(fields.property).ok_or("No 'property' in search doc found. This is required when indexing. Run with --rebuild-index")?;
296+
let value_val = retrieved_doc.get_first(fields.value).ok_or("No 'value' in search doc found. This is required when indexing. Run with --rebuild-index")?;
297+
298+
let subject = unpack_value(subject_val, &retrieved_doc, "Subject".to_string())?;
299+
let property = unpack_value(prop_val, &retrieved_doc, "Property".to_string())?;
300+
let value = unpack_value(value_val, &retrieved_doc, "Value".to_string())?;
301+
302+
subjects.insert(subject.clone());
303+
atoms.insert(StringAtom {
304+
subject,
305+
property,
306+
value,
307+
});
308+
}
309+
310+
Ok((subjects.into_iter().collect(), atoms.into_iter().collect()))
311+
}

0 commit comments

Comments
 (0)