Filter example - split a single document into multiple documents

This example shows a simple filter plugin that splits a source document into multiple documents, which will each be indexed separately.

The example below shows a simple filter implementation and corresponding tests.

Although this example implements a StringDocumentFilter, the other filter types, ByteDocumentFilter and Filter, can also produce multiple documents.

Example

In this example we split the input document into lines and create a new document for each line in the original document. Because we are creating new documents, we must assign a new URI to each one; in this example we generate a random URI for each new document. New documents are created by making a clone of the original document, which preserves the other parts of the document, such as metadata. This example implements the StringDocumentFilter interface. We are required to implement canFilter(), which is used to check whether the given document is a plain text document, as well as filterAsStringDocument(), which contains the logic for the filter.

DocumentFilterSplitLines.java

package com.example.pluginexamples;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.StringDocumentFilter;

import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * Example filter that splits a plain text document into one document per line.
 *
 * <p>Every produced document is a clone of the source document that carries a
 * freshly generated random URI and a single line of the source as its content.
 */
public class DocumentFilterSplitLines implements StringDocumentFilter {

    private static final Logger log = LogManager.getLogger(DocumentFilterSplitLines.class);

    /**
     * Regular expression matching a Unix ({@code \n}) or Windows ({@code \r\n})
     * line terminator. Splitting on a bare {@code \n} would leave a trailing
     * carriage return at the end of every line of a CRLF document.
     */
    private static final String LINE_TERMINATOR_REGEX = "\\r?\\n";

    /**
     * Decides whether this filter should run on the given document.
     *
     * @param document the document to inspect (content is not available here)
     * @param filterContext the context the filter runs in; not used by this check
     * @return {@link PreFilterCheck#ATTEMPT_FILTER} when the document's content type
     *         starts with {@code text/plain}, otherwise {@link PreFilterCheck#SKIP_FILTER}
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext filterContext) {
        //Only filter text documents.
        if (document.getDocumentType().asContentType().startsWith("text/plain")) {
            return PreFilterCheck.ATTEMPT_FILTER;
        }
        return PreFilterCheck.SKIP_FILTER;
    }

    /**
     * Splits the document content into lines and produces one new document per line.
     *
     * <p>Blank lines inside the content produce documents with empty content;
     * trailing empty lines are dropped by {@link String#split(String)}.
     *
     * @param document the source document whose content is split
     * @param filterContext the context the filter runs in; not used by this filter
     * @return a result holding the per-line documents, in the order the lines appear
     */
    @Override
    public FilterResult filterAsStringDocument(StringDocument document, FilterContext filterContext) {
        List<StringDocument> documents = new ArrayList<>();

        //Split the given document into lines and create a new document for each line.
        for (String line : document.getContentAsString().split(LINE_TERMINATOR_REGEX)) {
            //Create a random URL for our new document.
            String newUrl = "file://line/" + UUID.randomUUID();

            //Parameterized logging avoids building the message when INFO is disabled.
            log.info("Creating document with URL: {}", newUrl);

            //Clone the original so that all other attributes are preserved,
            //then swap in the new URL and the single line of content.
            StringDocument newDocument = document.cloneWithURI(URI.create(newUrl))
                    .cloneWithStringContent(document.getDocumentType(), line);
            documents.add(newDocument);
        }

        //Return all the documents we created.
        return FilterResult.of(documents);
    }
}

DocumentFilterSplitLinesTest.java

package com.example.pluginexamples;

import com.funnelback.filter.api.documents.FilterableDocument;
import org.junit.Assert;
import org.junit.Test;

import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Tests for {@link DocumentFilterSplitLines}.
 */
public class DocumentFilterSplitLinesTest {

    /**
     * A three-line plain text document must be split into three documents, one
     * per line and in order, each of which has its own distinct URL.
     */
    @Test
    public void multipleLinesTest() {
        // Input: a plain text document made up of three newline-separated lines.
        String threeLines = "line one\nline two\nline three";
        StringDocument input = MockDocuments.mockEmptyStringDoc()
                .cloneWithStringContent(DocumentType.MIME_TEXT_PLAIN, threeLines);

        // Run the filter under test against an empty context.
        DocumentFilterSplitLines splitter = new DocumentFilterSplitLines();
        FilterResult outcome = splitter.filter(input, MockFilterContext.getEmptyContext());

        Assert.assertEquals("Should have produced three documents as the original document had 3 lines",
                3, outcome.getFilteredDocuments().size());

        List<FilterableDocument> produced = outcome.getFilteredDocuments();

        // Expected content of each produced document, paired with its failure message.
        String[] failureMessages = {
            "Check content of document one",
            "Check content document two",
            "Check content document three",
        };
        String[] expectedContent = {"line one", "line two", "line three"};

        for (int i = 0; i < expectedContent.length; i++) {
            StringDocument lineDocument = (StringDocument) produced.get(i);
            Assert.assertEquals(failureMessages[i], expectedContent[i], lineDocument.getContentAsString());
        }

        // A Set discards duplicates, so its size equals the number of distinct URLs.
        Set<String> distinctUrls = new HashSet<>();
        for (int i = 0; i < expectedContent.length; i++) {
            distinctUrls.add(produced.get(i).getURI().toASCIIString());
        }

        Assert.assertEquals("Each produced document should have a unique URL", 3, distinctUrls.size());
    }
}

Help Center

Menu

Filter example - split a single document into multiple documents

Example

See also