Filter example - add metadata
This example shows the code for a simple filter plugin that adds metadata to a document.
The example below shows a simple filter implementation and corresponding tests.
Filters can be used to add metadata to documents. Although in this example StringDocumentFilter
is implemented, both ByteDocumentFilter
and Filter
can be used to add metadata to a document.
Example
In this example we count the number of occurrences of the word 'Elvis' and store this count in the document metadata. This example implements the StringDocumentFilter
. We are required to implement canFilter()
, used to check if the filter should be run, as well as filterAsStringDocument()
which contains the logic for the filter.
This plugin implements the filtering
and indexing
plugin templates, with the filtering
template implementing the filter logic and tests, and the indexing
template implementing an automatic metadata mapping.
DocumentFilterExtractAddMetadata.java
package com.example.pluginexamples;
import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.StringDocumentFilter;
import com.google.common.collect.ListMultimap;
import java.util.Locale;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
 * Example filter that counts how many times the word "elvis" (in any letter case)
 * appears in a document's content and records that count in the document metadata
 * under the key {@code elvis-count}.
 */
public class DocumentFilterExtractAddMetadata implements StringDocumentFilter {
    private static final Logger log = LogManager.getLogger(DocumentFilterExtractAddMetadata.class);

    /** Metadata key under which the occurrence count is stored. */
    private static final String METADATA_KEY = "elvis-count";

    /** Term searched for; must be lower case because the content is lower-cased before searching. */
    private static final String SEARCH_TERM = "elvis";

    /**
     * Decides whether this filter should be attempted for the given document.
     *
     * @param document the document about to be filtered (content not available here)
     * @param filterContext the context the filter runs in
     * @return always {@link PreFilterCheck#ATTEMPT_FILTER}
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext filterContext) {
        // Always run filter
        return PreFilterCheck.ATTEMPT_FILTER;
    }

    /**
     * Counts the occurrences of the search term in the document content and returns a
     * copy of the document whose {@code elvis-count} metadata holds exactly that count.
     *
     * @param document the document whose content is inspected
     * @param filterContext the context the filter runs in (unused)
     * @return a result wrapping the document clone that carries the updated metadata
     */
    @Override
    public FilterResult filterAsStringDocument(StringDocument document, FilterContext filterContext) {
        // Lower-case with Locale.ROOT so the match does not depend on the JVM's default
        // locale (e.g. a Turkish locale would turn "ELVIS" into "elvıs").
        String normalizedContent = document.getContentAsString().toLowerCase(Locale.ROOT);
        int elvisCount = countOccurrences(normalizedContent, SEARCH_TERM);

        log.debug("Found: {} counts of Elvis in {}", elvisCount, document.getURI());

        // Start from a copy of the existing metadata so that nothing already set is lost.
        ListMultimap<String, String> metadata = document.getCopyOfMetadata();

        // A key can hold several values; clear any previous ones so that the key ends up
        // with a single value, the current count.
        metadata.removeAll(METADATA_KEY);
        metadata.put(METADATA_KEY, Integer.toString(elvisCount));

        // Create a new document with the updated metadata.
        StringDocument documentWithElvisCount = document.cloneWithMetadata(metadata);
        return FilterResult.of(documentWithElvisCount);
    }

    /**
     * Counts non-overlapping occurrences of {@code needle} in {@code haystack}.
     *
     * <p>An explicit {@code indexOf} scan is used instead of {@code split(...).length - 1}
     * because {@link String#split(String)} discards trailing empty strings, which
     * under-counts matches at the end of the text and yields -1 when the text consists
     * solely of the needle.
     */
    private static int countOccurrences(String haystack, String needle) {
        int count = 0;
        int position = haystack.indexOf(needle);
        while (position >= 0) {
            count++;
            position = haystack.indexOf(needle, position + needle.length());
        }
        return count;
    }
}
DocumentFilterExtractAddMetadataTest.java
package com.example.pluginexamples;
import org.junit.Assert;
import org.junit.Test;
import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.documents.FilterableDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;
/**
 * Tests for {@link DocumentFilterExtractAddMetadata}: verifies the value written to the
 * {@code elvis-count} metadata key for content with and without the word "elvis".
 */
public class DocumentFilterExtractAddMetadataTest {

    /** Content containing "elvis" once must produce a single {@code elvis-count} value of "1". */
    @Test
    public void testMetadataIsSet() {
        var mockContext = MockFilterContext.getEmptyContext();
        var content = "<p>test</p><p>elvis</p><p>foo</p>";
        StringDocument doc = MockDocuments.mockStringDoc("http://foo.com/", DocumentType.MIME_HTML_TEXT, content);
        DocumentFilterExtractAddMetadata filter = new DocumentFilterExtractAddMetadata();

        FilterResult result = filter.filter(doc, mockContext);

        FilterableDocument resultDocument = result.getFilteredDocuments().get(0);
        // Metadata values are strings held in a collection per key, so compare against
        // the string "1" rather than the int 1.
        var elvisCounts = resultDocument.getMetadata().get("elvis-count");
        Assert.assertEquals("Exactly one elvis-count value is expected.", 1, elvisCounts.size());
        Assert.assertTrue("elvis-count should be \"1\".", elvisCounts.contains("1"));
    }

    /**
     * Content without "elvis": the filter still writes the key, so the expected outcome
     * is a single {@code elvis-count} value of "0" rather than a missing key.
     */
    @Test
    public void testNoMetadata() {
        var mockContext = MockFilterContext.getEmptyContext();
        var content = "<p>test</p><p>bar</p><p>foo</p>";
        StringDocument doc = MockDocuments.mockStringDoc("http://foo.com/", DocumentType.MIME_HTML_TEXT, content);
        DocumentFilterExtractAddMetadata filter = new DocumentFilterExtractAddMetadata();

        FilterResult result = filter.filter(doc, mockContext);

        FilterableDocument resultDocument = result.getFilteredDocuments().get(0);
        var elvisCounts = resultDocument.getMetadata().get("elvis-count");
        Assert.assertEquals("Exactly one elvis-count value is expected.", 1, elvisCounts.size());
        Assert.assertTrue("elvis-count should be \"0\".", elvisCounts.contains("0"));
    }
}
Metadata mapping
The above example added metadata with the name elvis-count
to the document. For the metadata to be available, it needs to be added to the data source metadata mappings.
To avoid manual mapping requirements the plugin should also implement the plugin indexing
interface and use the metadata mapping methods to set up metadata mappings when the plugin is enabled.
AddMetadataExampleIndexingConfigProvider.java
package com.example.plugin.addmetadataexample;
import com.funnelback.plugin.index.IndexConfigProviderContext;
import com.funnelback.plugin.index.IndexingConfigProvider;
import com.funnelback.plugin.index.consumers.MetadataMappingConsumer;
import com.funnelback.plugin.index.model.metadatamapping.MetadataSourceType;
import com.funnelback.plugin.index.model.metadatamapping.MetadataType;
/**
 * Registers the metadata mapping for the {@code elvis-count} metadata so that it does
 * not have to be configured by hand.
 */
public class AddMetadataExampleIndexingConfigProvider implements IndexingConfigProvider {

    /**
     * Supplies a single mapping for {@code elvis-count} to the given consumer.
     *
     * @param context the indexing configuration context (unused)
     * @param consumer receives the metadata mapping
     */
    @Override
    public void metadataMappings(IndexConfigProviderContext context, MetadataMappingConsumer consumer) {
        // NOTE(review): the first argument is presumably the metadata class and the last
        // the source field name; both are "elvis-count" here. Confirm the parameter
        // order against the MetadataMappingConsumer documentation.
        final String mappedName = "elvis-count";
        final String sourceName = "elvis-count";
        final MetadataType type = MetadataType.TEXT_INDEXED_AS_DOCUMENT_CONTENT;
        final MetadataSourceType sourceType = MetadataSourceType.HTML_OR_HTTP_HEADERS;

        consumer.map(mappedName, type, sourceType, sourceName);
    }
}
with corresponding test:
AddMetadataExampleIndexingConfigProviderTest.java
package com.example.plugin.addmetadataexample;
import org.junit.Assert;
import org.junit.Test;
import com.funnelback.plugin.index.consumers.mock.MockMetadataMappingConsumer;
import com.funnelback.plugin.index.mock.MockIndexConfigProviderContext;
/**
 * Verifies that {@link AddMetadataExampleIndexingConfigProvider} registers exactly one
 * metadata mapping and that its metadata class is {@code elvis-count}.
 */
public class AddMetadataExampleIndexingConfigProviderTest {

    @Test
    public void metadataMappingsTest() {
        // Arrange: mock context and a recording consumer.
        var context = new MockIndexConfigProviderContext();
        var consumer = new MockMetadataMappingConsumer();
        var provider = new AddMetadataExampleIndexingConfigProvider();

        // Act
        provider.metadataMappings(context, consumer);

        // Assert: one mapping was recorded, for the expected metadata class.
        var recorded = consumer.getInvocations();
        Assert.assertEquals("Check how many times the consumer was called.", 1, recorded.size());
        Assert.assertEquals("elvis-count", recorded.get(0).getMetadataClass());
    }
}