Filter example - add metadata

This example shows the code for a simple filter plugin that adds metadata to a document.

The example below shows a simple filter implementation and corresponding tests.

Filters can be used to add metadata to documents. Although in this example StringDocumentFilter is implemented, both ByteDocumentFilter and Filter can be used to add metadata to a document.

Example

In this example we count the number of occurrences of the word 'Elvis' and store this count in the document metadata. This example implements the StringDocumentFilter. We are required to implement canFilter(), used to check if the filter should be run, as well as filterAsStringDocument(), which contains the logic for the filter.

This plugin implements the filtering and indexing plugin templates, with the filtering template implementing the filter logic and tests, and the indexing template implementing an automatic metadata mapping.

DocumentFilterExtractAddMetadata.java
package com.example.pluginexamples;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.StringDocumentFilter;
import com.google.common.collect.ListMultimap;

/**
 * Example filter that counts the case-insensitive occurrences of the word
 * "elvis" in a document's content and records that count in the document
 * metadata under the key {@code elvis-count}.
 */
public class DocumentFilterExtractAddMetadata implements StringDocumentFilter {

    private static final Logger log = LogManager.getLogger(DocumentFilterExtractAddMetadata.class);

    /** Metadata key under which the occurrence count is stored. */
    private static final String METADATA_KEY = "elvis-count";

    /** Word to count, in lower case because the content is lower-cased before searching. */
    private static final String SEARCH_TERM = "elvis";

    /**
     * Decides whether this filter should run for the given document.
     *
     * @param document the document about to be filtered
     * @param filterContext the context the filter runs in
     * @return always {@link PreFilterCheck#ATTEMPT_FILTER}; this filter runs on every document
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext filterContext) {
        return PreFilterCheck.ATTEMPT_FILTER;
    }

    /**
     * Counts the occurrences of "elvis" in the document content and returns a copy
     * of the document whose {@code elvis-count} metadata holds exactly that count.
     *
     * @param document the document whose content is inspected
     * @param filterContext the context the filter runs in (not used by this filter)
     * @return a result wrapping a clone of {@code document} with the updated metadata
     */
    @Override
    public FilterResult filterAsStringDocument(StringDocument document, FilterContext filterContext) {
        // Lower-case with a fixed locale so the match does not depend on the JVM default
        // locale (e.g. under a Turkish locale "ELVIS" lower-cases to "elv\u0131s").
        String lowerCasedContent = document.getContentAsString().toLowerCase(java.util.Locale.ROOT);
        int elvisCount = countOccurrences(lowerCasedContent, SEARCH_TERM);

        log.debug("Found: {} counts of Elvis in {}", elvisCount, document.getURI());

        // Start from a copy of the existing metadata so other metadata is preserved.
        ListMultimap<String, String> metadata = document.getCopyOfMetadata();

        // A key can hold several values; drop any previous values so only the new count remains.
        metadata.removeAll(METADATA_KEY);
        metadata.put(METADATA_KEY, Integer.toString(elvisCount));

        // Create a new document with the updated metadata.
        StringDocument documentWithElvisCount = document.cloneWithMetadata(metadata);

        return FilterResult.of(documentWithElvisCount);
    }

    /**
     * Counts non-overlapping occurrences of {@code needle} in {@code haystack}.
     *
     * <p>This replaces {@code haystack.split(needle).length - 1}, which undercounts
     * because {@link String#split(String)} discards trailing empty strings: text that
     * ends with the needle loses an occurrence, and text equal to the needle yields -1.
     *
     * @param haystack the text to search
     * @param needle the non-empty literal text to look for
     * @return the number of occurrences, never negative
     */
    private static int countOccurrences(String haystack, String needle) {
        int count = 0;
        int position = haystack.indexOf(needle);
        while (position >= 0) {
            count++;
            position = haystack.indexOf(needle, position + needle.length());
        }
        return count;
    }
}
DocumentFilterExtractAddMetadataTest.java
package com.example.pluginexamples;

import org.junit.Assert;
import org.junit.Test;

import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.documents.FilterableDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;

/**
 * Tests for {@link DocumentFilterExtractAddMetadata}, checking the value written
 * to the {@code elvis-count} metadata for documents with and without the word.
 */
public class DocumentFilterExtractAddMetadataTest {

    /** A document mentioning "elvis" once gets a single {@code elvis-count} value of "1". */
    @Test
    public void testMetadataIsSet() {
        var mockContext = MockFilterContext.getEmptyContext();

        var content = "<p>test</p><p>elvis</p><p>foo</p>";
        StringDocument doc = MockDocuments.mockStringDoc("http://foo.com/", DocumentType.MIME_HTML_TEXT, content);

        DocumentFilterExtractAddMetadata filter = new DocumentFilterExtractAddMetadata();

        FilterResult result = filter.filter(doc, mockContext);
        FilterableDocument resultDocument = result.getFilteredDocuments().get(0);

        var metadata = resultDocument.getMetadata();
        // Metadata values are strings held in a collection per key, so compare against
        // the string "1" rather than the integer 1.
        Assert.assertEquals(1, metadata.get("elvis-count").size());
        Assert.assertEquals("1", metadata.get("elvis-count").get(0));
    }

    /**
     * A document that never mentions "elvis" still gets an {@code elvis-count} value:
     * the filter always stores the count, which is "0" here.
     */
    @Test
    public void testNoMetadata() {
        var mockContext = MockFilterContext.getEmptyContext();

        var content = "<p>test</p><p>bar</p><p>foo</p>";
        StringDocument doc = MockDocuments.mockStringDoc("http://foo.com/", DocumentType.MIME_HTML_TEXT, content);

        DocumentFilterExtractAddMetadata filter = new DocumentFilterExtractAddMetadata();

        FilterResult result = filter.filter(doc, mockContext);
        FilterableDocument resultDocument = result.getFilteredDocuments().get(0);

        var metadata = resultDocument.getMetadata();
        Assert.assertEquals(1, metadata.get("elvis-count").size());
        Assert.assertEquals("0", metadata.get("elvis-count").get(0));
    }

}

Metadata mapping

The above example added metadata with the name elvis-count to the document. For the metadata to be available, it needs to be added to the data source metadata mappings.

To avoid manual mapping requirements the plugin should also implement the plugin indexing interface and use the metadata mapping methods to set up metadata mappings when the plugin is enabled.

AddMetadataExampleIndexingConfigProvider.java
package com.example.plugin.addmetadataexample;

import com.funnelback.plugin.index.IndexConfigProviderContext;
import com.funnelback.plugin.index.IndexingConfigProvider;
import com.funnelback.plugin.index.consumers.MetadataMappingConsumer;
import com.funnelback.plugin.index.model.metadatamapping.MetadataSourceType;
import com.funnelback.plugin.index.model.metadatamapping.MetadataType;

/**
 * Indexing configuration for the add-metadata example: registers a metadata
 * mapping for {@code elvis-count} so that no manual mapping is required.
 */
public class AddMetadataExampleIndexingConfigProvider implements IndexingConfigProvider {

    /** Name passed as both the first and the last argument of the mapping. */
    private static final String ELVIS_COUNT = "elvis-count";

    /**
     * Registers the single {@code elvis-count} mapping with the supplied consumer.
     *
     * @param context the indexing configuration context (not used here)
     * @param consumer receives the metadata mapping
     */
    @Override
    public void metadataMappings(IndexConfigProviderContext context, MetadataMappingConsumer consumer) {
        consumer.map(
                ELVIS_COUNT,
                MetadataType.TEXT_INDEXED_AS_DOCUMENT_CONTENT,
                MetadataSourceType.HTML_OR_HTTP_HEADERS,
                ELVIS_COUNT);
    }
}

with corresponding test:

AddMetadataExampleIndexingConfigProviderTest.java
package com.example.plugin.addmetadataexample;

import org.junit.Assert;
import org.junit.Test;

import com.funnelback.plugin.index.consumers.mock.MockMetadataMappingConsumer;
import com.funnelback.plugin.index.mock.MockIndexConfigProviderContext;

/**
 * Tests that {@link AddMetadataExampleIndexingConfigProvider} registers the
 * expected metadata mapping.
 */
public class AddMetadataExampleIndexingConfigProviderTest {

    /** The provider calls the consumer exactly once, for {@code elvis-count}. */
    @Test
    public void metadataMappingsTest() {
        var provider = new AddMetadataExampleIndexingConfigProvider();
        var context = new MockIndexConfigProviderContext();
        var consumer = new MockMetadataMappingConsumer();

        provider.metadataMappings(context, consumer);

        // Fetch the recorded calls once and check both their number and content.
        var recorded = consumer.getInvocations();
        Assert.assertEquals("Check how many times the consumer was called.", 1, recorded.size());
        Assert.assertEquals("elvis-count", recorded.get(0).getMetadataClass());
    }

}