Skip to content

Commit

Permalink
GH-5149 Lucene numdocs param (#5163)
Browse files Browse the repository at this point in the history
  • Loading branch information
hmottestad authored Nov 20, 2024
2 parents f687e85 + 42a331d commit 08b898c
Show file tree
Hide file tree
Showing 11 changed files with 294 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;

Expand Down Expand Up @@ -577,10 +578,11 @@ protected Iterable<? extends DocumentScore> query(Resource subject, QuerySpec sp
}

SearchHits hits;
int numDocs = Objects.requireNonNullElse(spec.getNumDocs(), -1);
if (subject != null) {
hits = search(subject, request, qb);
hits = search(subject, request, qb, numDocs);
} else {
hits = search(request, qb);
hits = search(request, qb, numDocs);
}
return Iterables.transform(hits, new Function<>() {

Expand All @@ -600,11 +602,24 @@ public DocumentScore apply(SearchHit hit) {
* @return search hits
*/
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query) {
	// No per-query document count was given; -1 is the "not specified" value that the
	// four-argument overload receives in that case.
	final int unspecifiedNumDocs = -1;
	return search(resource, request, query, unspecifiedNumDocs);
}

/**
 * Evaluates the given query only for the given resource.
 *
 * @param resource the resource the search is restricted to; its resource ID is matched against the URI field
 * @param request  the search request, handed on unchanged to the index-wide search
 * @param query    the query to evaluate
 * @param numDocs  the number of documents requested for this query; callers pass -1 when the query does not
 *                 specify one
 * @return search hits
 */
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query, int numDocs) {
	// rewrite the query: both the resource-ID term and the caller's query must match
	QueryBuilder idQuery = QueryBuilders.termQuery(SearchFields.URI_FIELD_NAME,
			SearchFields.getResourceID(resource));
	QueryBuilder combinedQuery = QueryBuilders.boolQuery().must(idQuery).must(query);
	// Forward numDocs; the stale three-argument return that preceded this line ignored it and
	// made this statement unreachable.
	return search(request, combinedQuery, numDocs);
}

@Override
Expand Down Expand Up @@ -712,10 +727,23 @@ private ShapeRelation toSpatialOp(String relation) {
* Evaluates the given query and returns the results as a SearchHits instance.
*/
public SearchHits search(SearchRequestBuilder request, QueryBuilder query) {
	// -1 tells the three-argument overload that the query did not ask for a specific
	// number of documents.
	final int unspecifiedNumDocs = -1;
	return search(request, query, unspecifiedNumDocs);
}

/**
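* Evaluates the given query and returns the results as a SearchHits instance. A positive numDocs is the number of
* documents requested for this query (capped by maxDocs when that is set).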
*/
public SearchHits search(SearchRequestBuilder request, QueryBuilder query, int numDocs) {
String[] types = getTypes();
int nDocs;
if (maxDocs > 0) {
nDocs = maxDocs;
if (numDocs > 0) {
if (maxDocs > 0 && maxDocs < numDocs) {
nDocs = maxDocs;
} else {
nDocs = numDocs;
}
} else if (defaultNumDocs > 0) {
nDocs = defaultNumDocs;
} else {
long docCount = client.prepareSearch(indexName)
.setTypes(types)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public abstract class AbstractSearchIndex implements SearchIndex {
REJECTED_DATATYPES.add("http://www.w3.org/2001/XMLSchema#float");
}

/**
 * Number of documents configured through {@link LuceneSail#DEFAULT_NUM_DOCS_KEY}. {@code initialize} only
 * overwrites this field when that parameter is present; otherwise the current value is kept.
 */
protected int defaultNumDocs;
/**
 * Value of the {@link LuceneSail#MAX_DOCUMENTS_KEY} parameter, or -1 when {@code initialize} finds no such
 * parameter.
 */
protected int maxDocs;

protected Set<String> wktFields = Collections.singleton(SearchFields.getPropertyField(GEO.AS_WKT));
Expand All @@ -75,8 +76,10 @@ public abstract class AbstractSearchIndex implements SearchIndex {

@Override
public void initialize(Properties parameters) throws Exception {
String maxDocParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
maxDocs = (maxDocParam != null) ? Integer.parseInt(maxDocParam) : -1;
String maxDocumentsParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
maxDocs = (maxDocumentsParam != null) ? Integer.parseInt(maxDocumentsParam) : -1;
String defaultNumDocsParam = parameters.getProperty(LuceneSail.DEFAULT_NUM_DOCS_KEY);
defaultNumDocs = (defaultNumDocsParam != null) ? Integer.parseInt(defaultNumDocsParam) : defaultNumDocs;

String wktFieldParam = parameters.getProperty(LuceneSail.WKT_FIELDS);
if (wktFieldParam != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,17 @@ public class LuceneSail extends NotifyingSailWrapper {
public static final String LUCENE_RAMDIR_KEY = "useramdir";

/**
 * Set the key "defaultNumDocs=&lt;n&gt;" as sail parameter to set the number of documents to return from a search
 * query that does not request a number of documents itself. The default is to return all documents. NB: this may
 * involve extra cost for some SearchIndex implementations as they may have to determine this number.
 */
public static final String DEFAULT_NUM_DOCS_KEY = "defaultNumDocs";

/**
 * Set the key "maxDocuments=&lt;n&gt;" as sail parameter to cap the number of documents that a single search query
 * may request. The default is the value of the {@link #DEFAULT_NUM_DOCS_KEY} parameter.
 */
public static final String MAX_DOCUMENTS_KEY = "maxDocuments";

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class LuceneSailSchema {

/** IRI built in the static initializer as {@code NAMESPACE + "context"}. */
public static final IRI CONTEXT;

/**
 * IRI built in the static initializer as {@code NAMESPACE + "numDocs"}; used as the predicate of the statement
 * pattern that carries a query's requested number of documents.
 */
public static final IRI NUM_DOCS;

static {
ValueFactory factory = SimpleValueFactory.getInstance(); // compatible with beta4:
// creating a new factory
Expand All @@ -73,5 +75,6 @@ public class LuceneSailSchema {
WITHIN_DISTANCE = factory.createIRI(NAMESPACE + "withinDistance");
DISTANCE = factory.createIRI(NAMESPACE + "distance");
CONTEXT = factory.createIRI(NAMESPACE + "context");
NUM_DOCS = factory.createIRI(NAMESPACE + "numDocs");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import java.util.stream.Collectors;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.query.algebra.QueryModelNode;
import org.eclipse.rdf4j.query.algebra.SingletonSet;
import org.eclipse.rdf4j.query.algebra.StatementPattern;
Expand Down Expand Up @@ -67,21 +69,43 @@ private static void append(Var var, StringBuilder buffer) {

private final StatementPattern idPattern;

/** The numDocs statement pattern of the query; {@code null} when the query was built without one. */
private final StatementPattern numDocsPattern;

private final Resource subject;

private final String matchesVarName;

private final String scoreVarName;

/**
 * Integer value of the literal object of {@link #numDocsPattern}; {@code null} when there is no numDocs pattern.
 */
private final Integer numDocs;

/**
 * Creates a query specification that has no numDocs pattern. All arguments are handed to the seven-argument
 * constructor, with {@code null} in the numDocs pattern position.
 */
public QuerySpec(StatementPattern matchesPattern, Collection<QueryParam> queryPatterns,
		StatementPattern scorePattern, StatementPattern typePattern,
		StatementPattern idPattern, Resource subject) {
	this(
			matchesPattern,
			queryPatterns,
			scorePattern,
			typePattern,
			idPattern,
			null, // no numDocs pattern
			subject);
}

public QuerySpec(StatementPattern matchesPattern, Collection<QueryParam> queryPatterns,
StatementPattern scorePattern, StatementPattern typePattern,
StatementPattern idPattern, StatementPattern numDocsPattern, Resource subject) {
this.matchesPattern = matchesPattern;
this.queryPatterns = queryPatterns;
this.scorePattern = scorePattern;
this.typePattern = typePattern;
this.idPattern = idPattern;
this.numDocsPattern = numDocsPattern;
this.subject = subject;
if (numDocsPattern != null) {
Value val = numDocsPattern.getObjectVar().getValue();
if (val != null && val.isLiteral()) {
this.numDocs = ((Literal) val).intValue();
} else {
throw new IllegalArgumentException("numDocs should be constant literal value");
}
} else {
this.numDocs = null;
}

if (matchesPattern != null) {
this.matchesVarName = matchesPattern.getSubjectVar().getName();
} else {
Expand All @@ -101,9 +125,11 @@ public QuerySpec(String matchesVarName, String propertyVarName, String scoreVarN
this.matchesPattern = null;
this.scorePattern = null;
this.typePattern = null;
this.numDocsPattern = null;
this.queryPatterns = Set.of();
this.idPattern = null;
this.subject = subject;
this.numDocs = null;
}

@Override
Expand All @@ -121,6 +147,7 @@ public QueryModelNode removeQueryPatterns() {
replace(getScorePattern(), replacement);
replace(getTypePattern(), replacement);
replace(getIdPattern(), replacement);
replace(getNumDocsPattern(), replacement);

final QueryModelNode placeholder = new SingletonSet();

Expand Down Expand Up @@ -154,6 +181,10 @@ public StatementPattern getScorePattern() {
return scorePattern;
}

/**
 * Gives access to the numDocs statement pattern this specification was created with.
 *
 * @return the numDocs pattern, or {@code null} if the specification was created without one
 */
public StatementPattern getNumDocsPattern() {
	return this.numDocsPattern;
}

/**
* The variable name associated with the query score
*
Expand All @@ -163,6 +194,10 @@ public String getScoreVariableName() {
return scoreVarName;
}

/**
 * The number of documents requested by the query.
 *
 * @return the integer value of the numDocs literal, or {@code null} if the specification has no numDocs pattern
 */
public Integer getNumDocs() {
	return this.numDocs;
}

/**
 * Gives access to the type statement pattern this specification was created with.
 *
 * @return the type pattern; {@code null} if the specification was created from variable names only
 */
public StatementPattern getTypePattern() {
	return this.typePattern;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.INDEXID;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.LUCENE_QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.MATCHES;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.NUM_DOCS;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.PROPERTY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SCORE;
Expand Down Expand Up @@ -152,7 +153,7 @@ public void process(TupleExpr tupleExpr, BindingSet bindings, Collection<SearchQ
}

// find the relevant outgoing patterns
StatementPattern typePattern, propertyPattern, scorePattern, snippetPattern;
StatementPattern typePattern, propertyPattern, scorePattern, snippetPattern, numDocsPattern;
List<StatementPattern> queryPatterns;

try {
Expand All @@ -161,6 +162,7 @@ public void process(TupleExpr tupleExpr, BindingSet bindings, Collection<SearchQ
propertyPattern = getPattern(matchesVar, filter.propertyPatterns);
scorePattern = getPattern(matchesVar, filter.scorePatterns);
snippetPattern = getPattern(matchesVar, filter.snippetPatterns);
numDocsPattern = getPattern(matchesVar, filter.numDocsPatterns);
} catch (IllegalArgumentException e) {
failOrWarn(e);
continue;
Expand Down Expand Up @@ -302,7 +304,8 @@ else if (propertyValue != null) {
queryString, propertyURI, null));
}

QuerySpec querySpec = new QuerySpec(matchesPattern, queries, scorePattern, typePattern, idPattern, subject);
QuerySpec querySpec = new QuerySpec(matchesPattern, queries, scorePattern, typePattern, idPattern,
numDocsPattern, subject);

if (querySpec.isEvaluable()) {
// constant optimizer
Expand Down Expand Up @@ -341,6 +344,10 @@ else if (propertyValue != null) {
funcCall.addArg(new ValueConstant(LuceneSailSchema.SNIPPET));
funcCall.addResultVar(snippetVar);
}
if (numDocsPattern != null) {
funcCall.addArg(new ValueConstant(LuceneSailSchema.NUM_DOCS));
funcCall.addArg(numDocsPattern.getObjectVar());
}

Join join = new Join();
matchesPattern.replaceWith(join);
Expand Down Expand Up @@ -465,6 +472,8 @@ private static class PatternFilter extends AbstractQueryModelVisitor<RuntimeExce

public ArrayList<StatementPattern> boostPatterns = new ArrayList<>();

public ArrayList<StatementPattern> numDocsPatterns = new ArrayList<>();

/**
* Method implementing the visitor pattern that gathers all statements using a predicate from the LuceneSail's
* namespace.
Expand All @@ -487,6 +496,8 @@ public void meet(StatementPattern node) {
idPatterns.add(node);
} else if (BOOST.equals(predicate)) {
boostPatterns.add(node);
} else if (NUM_DOCS.equals(predicate)) {
numDocsPatterns.add(node);
} else if (TYPE.equals(predicate)) {
Value object = node.getObjectVar().getValue();
if (LUCENE_QUERY.equals(object)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.BOOST;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.LUCENE_QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.MATCHES;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.NUM_DOCS;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SCORE;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SNIPPET;
Expand Down Expand Up @@ -55,6 +56,7 @@ public void testQueryInterpretation() {
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
"<" + QUERY + "> \"my Lucene query\"; " +
"<" + SCORE + "> ?Score; " +
"<" + NUM_DOCS + "> 76; " +
"<" + SNIPPET + "> ?Snippet ]. } ";
ParsedQuery query = parser.parseQuery(buffer, null);
TupleExpr tupleExpr = query.getTupleExpr();
Expand All @@ -69,6 +71,8 @@ public void testQueryInterpretation() {
assertEquals("Score", querySpec.getScorePattern().getObjectVar().getName());
assertEquals("Snippet", param.getSnippetPattern().getObjectVar().getName());
assertEquals(LUCENE_QUERY, querySpec.getTypePattern().getObjectVar().getValue());
assertEquals(76, querySpec.getNumDocs());
assertEquals(76, ((Literal) querySpec.getNumDocsPattern().getObjectVar().getValue()).intValue());
assertEquals("my Lucene query", param.getQuery());
assertNull(querySpec.getSubject());
}
Expand All @@ -80,11 +84,13 @@ public void testMultipleQueriesInterpretation() {
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
"<" + QUERY + "> \"my Lucene query\"; " +
"<" + SCORE + "> ?score1; " +
"<" + NUM_DOCS + "> 86; " +
"<" + SNIPPET + "> ?snippet1 ]. " +
" ?sub2 <" + MATCHES + "> [ " +
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
"<" + QUERY + "> \"second lucene query\"; " +
"<" + SCORE + "> ?score2; " +
"<" + NUM_DOCS + "> 13; " +
"<" + SNIPPET + "> ?snippet2 ]. " +
// and connect them both via any X in between, just as salt to make the
// parser do something
Expand All @@ -103,6 +109,7 @@ public void testMultipleQueriesInterpretation() {
// Matched the first
assertEquals("sub1", querySpec.getMatchesPattern().getSubjectVar().getName());
assertEquals(1, querySpec.getQueryPatterns().size());
assertEquals(86, querySpec.getNumDocs());
QuerySpec.QueryParam param = querySpec.getQueryPatterns().iterator().next();
assertEquals("my Lucene query",
((Literal) param.getQueryPattern().getObjectVar().getValue()).getLabel());
Expand All @@ -116,6 +123,7 @@ public void testMultipleQueriesInterpretation() {
// and the second
assertEquals("sub2", querySpec.getMatchesPattern().getSubjectVar().getName());
assertEquals(1, querySpec.getQueryPatterns().size());
assertEquals(13, querySpec.getNumDocs());
QuerySpec.QueryParam param = querySpec.getQueryPatterns().iterator().next();
assertEquals("second lucene query",
((Literal) param.getQueryPattern().getObjectVar().getValue()).getLabel());
Expand Down
Loading

0 comments on commit 08b898c

Please sign in to comment.