Filter example - remove a document

This example shows a simple filter plugin that prevents a document from appearing in the search index by intercepting the document and discarding it before it is stored. The example looks for documents that resemble a sitemap.xml file and discards these.

The example below shows a simple filter implementation and corresponding tests.

Although this example implements a Filter, the other filter types (StringDocumentFilter and ByteDocumentFilter) can also be used to remove documents.

Table of Contents

Example

In this example we remove any document that looks like a sitemap file, that is, any document whose URL path ends with sitemap.xml. The example implements the Filter interface, which requires the filter() method to be implemented.

DocumentFilterRemoveDocument.java
package com.example.pluginexamples;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.Filter;
import com.funnelback.filter.api.documents.FilterableDocument;
import com.funnelback.filter.api.filters.FilterException;

/**
 * Filter that discards documents which look like a sitemap.
 *
 * <p>A document is treated as a sitemap when the path component of its URI ends with
 * {@code sitemap.xml}. Every other document is passed through unchanged.
 */
public class DocumentFilterRemoveDocument implements Filter {

    private static final Logger log = LogManager.getLogger(DocumentFilterRemoveDocument.class);

    /** URL path suffix used to recognise a sitemap document. */
    private static final String SITEMAP_SUFFIX = "sitemap.xml";

    /**
     * Deletes the document when its URL path ends with {@code sitemap.xml}, otherwise keeps it.
     *
     * @param document the document being filtered; only its URI is inspected
     * @param context  the filter context (not used by this filter)
     * @return {@code FilterResult.delete()} for sitemap URLs, or {@code FilterResult.of(document)}
     *         wrapping the original document for everything else
     */
    @Override
    public FilterResult filter(FilterableDocument document, FilterContext context) throws RuntimeException,
            FilterException {
        // URI.getPath() returns null for opaque URIs (e.g. "mailto:a@b.com"). Such a
        // document cannot be a sitemap, so keep it instead of failing with an NPE.
        String path = document.getURI().getPath();

        // Assume any URL path that ends with sitemap.xml is a site map URL.
        if (path != null && path.endsWith(SITEMAP_SUFFIX)) {
            // Returning this removes the document; when crawling, the document will
            // not be stored.
            return FilterResult.delete();
        }

        // Parameterized logging avoids building the message when debug is disabled.
        log.debug("Keeping document with URL: {} as it is not a site map.", document.getURI());

        return FilterResult.of(document);
    }
}
DocumentFilterRemoveDocumentTest.java
package com.example.pluginexamples;

import org.junit.Assert;
import org.junit.Test;

import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;

import java.net.URI;

/**
 * Tests for {@link DocumentFilterRemoveDocument}: sitemap URLs must be removed and
 * all other documents must be returned untouched.
 */
public class DocumentFilterRemoveDocumentTest {

    /** Builds an empty mock string document located at the given URL. */
    private static StringDocument emptyDocAt(String url) {
        return MockDocuments.mockEmptyStringDoc().cloneWithURI(URI.create(url));
    }

    /** Runs a fresh filter instance over the document using an empty mock context. */
    private static FilterResult applyFilter(StringDocument doc) {
        DocumentFilterRemoveDocument filterUnderTest = new DocumentFilterRemoveDocument();
        return filterUnderTest.filter(doc, MockFilterContext.getEmptyContext());
    }

    /** A URL whose path ends with sitemap.xml yields no filtered documents. */
    @Test
    public void removeSitemapTest() {
        StringDocument sitemapDoc = emptyDocAt("https://foo.com/sitemap.xml");

        FilterResult outcome = applyFilter(sitemapDoc);

        Assert.assertEquals("No documents should have been returned as sitemap URLs should be removed",
                0, outcome.getFilteredDocuments().size());
    }

    /** A non-sitemap URL yields exactly one document: the original input. */
    @Test
    public void keepsNonSitemapDocumentsTest() {
        StringDocument ordinaryDoc = emptyDocAt("https://foo.com/hello.html");

        FilterResult outcome = applyFilter(ordinaryDoc);

        Assert.assertEquals("One document should have been returned by the filter",
                1, outcome.getFilteredDocuments().size());

        Assert.assertEquals("We should have returned the original document without modification",
                ordinaryDoc, outcome.getFilteredDocuments().get(0));
    }
}