Filter example - remove a document
This example shows a simple filter plugin that prevents a document from appearing in the search index by intercepting the document and discarding it before it is stored. The example looks for documents that resemble a sitemap.xml file and discards these.
The example below shows a simple filter implementation and corresponding tests.
Although this example implements a Filter, the other filter types, StringDocumentFilter and ByteDocumentFilter, can also be used to remove documents.
Example
In this example we remove any document that looks like a sitemap file, that is, any document whose URL path ends with sitemap.xml. The example implements Filter, which requires the filter() method to be implemented.
DocumentFilterRemoveDocument.java
package com.example.pluginexamples;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.Filter;
import com.funnelback.filter.api.documents.FilterableDocument;
import com.funnelback.filter.api.filters.FilterException;
/**
 * Filter that discards documents whose URL path ends with {@code sitemap.xml}.
 *
 * <p>Every other document is passed through unchanged.
 */
public class DocumentFilterRemoveDocument implements Filter {

    private static final Logger log = LogManager.getLogger(DocumentFilterRemoveDocument.class);

    /**
     * Decides whether the given document should be kept or removed.
     *
     * @param document the document being filtered; its URI is inspected to detect site maps
     * @param context  the filter context (not used by this filter)
     * @return {@code FilterResult.delete()} when the URL path ends with {@code sitemap.xml},
     *         otherwise a result wrapping the original, unmodified document
     * @throws FilterException declared by the {@code Filter} contract; not thrown directly here
     */
    @Override
    public FilterResult filter(FilterableDocument document, FilterContext context) throws FilterException {
        // URI.getPath() returns null for opaque URIs (e.g. "mailto:someone@example.com"),
        // so guard against null rather than letting endsWith() throw a NullPointerException.
        String path = document.getURI().getPath();

        // Assume any URL path that ends with sitemap.xml is a site map URL.
        if (path != null && path.endsWith("sitemap.xml")) {
            // Returning this removes the document; when crawling, the document
            // will not be stored.
            return FilterResult.delete();
        }

        // Parameterized logging avoids building the message when debug is disabled.
        log.debug("Keeping document with URL: {} as it is not a site map.", document.getURI());
        return FilterResult.of(document);
    }
}
DocumentFilterRemoveDocumentTest.java
package com.example.pluginexamples;
import org.junit.Assert;
import org.junit.Test;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;
import java.net.URI;
/**
 * Tests for {@link DocumentFilterRemoveDocument}: sitemap URLs must be dropped and
 * every other document must be passed through untouched.
 */
public class DocumentFilterRemoveDocumentTest {

    /** Creates an empty mock string document whose URI is the given URL. */
    private static StringDocument emptyDocumentAt(String url) {
        return MockDocuments.mockEmptyStringDoc().cloneWithURI(URI.create(url));
    }

    /** Runs a fresh filter instance over the document with an empty mock context. */
    private static FilterResult applyFilter(StringDocument doc) {
        return new DocumentFilterRemoveDocument().filter(doc, MockFilterContext.getEmptyContext());
    }

    @Test
    public void removeSitemapTest() {
        FilterResult outcome = applyFilter(emptyDocumentAt("https://foo.com/sitemap.xml"));

        int returnedCount = outcome.getFilteredDocuments().size();
        Assert.assertEquals("No documents should have been returned as sitemap URLs should be removed",
            0, returnedCount);
    }

    @Test
    public void keepsNonSitemapDocumentsTest() {
        StringDocument original = emptyDocumentAt("https://foo.com/hello.html");

        FilterResult outcome = applyFilter(original);

        // Exactly one document must come back...
        int returnedCount = outcome.getFilteredDocuments().size();
        Assert.assertEquals("One document should have been returned by the filter",
            1, returnedCount);

        // ...and it must be the very document that was passed in.
        Assert.assertEquals("We should have returned the original document without modification",
            original, outcome.getFilteredDocuments().get(0));
    }
}