Filter example - split a single HTML document into multiple documents

This example shows a simple filter plugin that splits an HTML source document into multiple documents, which will each be indexed separately.

The example uses Jsoup (the same library as used by Jsoup filters), which is applied within the string document filter to convert the HTML document into an HTML document object that can then be iterated over.

HTML document (Jsoup) filters should normally be used for working with HTML documents. Splitting an HTML document is a case where you must use a general document filter instead, as HTML document (Jsoup) filters don’t support the splitting of documents.

The example below shows a simple filter implementation and corresponding tests.

Example

In this example we split the input HTML document on its article elements, creating a new document for each article in the original document. As we are creating new documents, we must assign a new URI to each one; in this example the URI is taken from the first link (href attribute) found within each article. New documents are created by cloning the original document, which preserves other parts of the document such as metadata. This example implements the StringDocumentFilter interface. We are required to implement canFilter(), which is used to check if the given document is an HTML document, as well as filterAsStringDocument(), which contains the logic for the filter.

DocumentFilterSplitHtmlDocument.java
package com.example.pluginexamples;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.documents.FilterableDocument;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.StringDocumentFilter;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.net.URI;
import java.util.ArrayList;

/**
 * Document filter that splits one HTML document into several documents, one per
 * {@code <article>} element found in the source.
 *
 * <p>Each produced document is a clone of the source document that has been given a new URI
 * (taken from the first link inside the article) and new content (a minimal HTML page whose
 * body is the article's inner HTML).
 */
public class DocumentFilterSplitHtmlDocument implements StringDocumentFilter {

    private static final Logger log = LogManager.getLogger(DocumentFilterSplitHtmlDocument.class);

    /**
     * Restricts this filter to HTML documents.
     *
     * @param document the document being considered, without its content
     * @param filterContext the context the filter is running in (unused)
     * @return {@link PreFilterCheck#ATTEMPT_FILTER} when the document type reports itself as
     *         HTML, otherwise {@link PreFilterCheck#SKIP_FILTER}
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext filterContext) {
        if (document.getDocumentType().isHTML()) {
            // Only run on HTML documents
            return PreFilterCheck.ATTEMPT_FILTER;
        }
        return PreFilterCheck.SKIP_FILTER;
    }

    /**
     * Creates one new document for every {@code <article>} element that contains a usable link.
     *
     * <p>Articles that have no link, or whose link is not a syntactically valid URI, are skipped
     * with a warning so that one bad article does not prevent the others from being produced.
     *
     * @param document the HTML document to split
     * @param filterContext the context the filter is running in (unused)
     * @return a result holding the per-article documents, in the order the articles appear
     */
    @Override
    public FilterResult filterAsStringDocument(StringDocument document, FilterContext filterContext) {
        // Parse with the source document's URI as the base URI so that relative links inside
        // the articles can be resolved to absolute URLs below.
        Document jsoupDoc = Jsoup.parse(document.getContentAsString(), document.getURI().toString());

        ArrayList<FilterableDocument> docs = new ArrayList<>();

        // Each article element becomes its own document
        for (Element element : jsoupDoc.select("article")) {
            // Prefer the absolute form of the first link so relative hrefs do not end up as
            // relative document URIs; fall back to the raw value when it cannot be resolved.
            String href = element.select("a").attr("abs:href");
            if (href.isEmpty()) {
                href = element.select("a").attr("href");
            }
            href = href.trim();

            if (href.isEmpty()) {
                // Without a link there is nothing to use as the new document's URI
                log.warn("Skipping article with no link to use as its URL");
                continue;
            }

            URI uri;
            try {
                uri = URI.create(href);
            } catch (IllegalArgumentException e) {
                // URI.create rejects syntactically invalid values; skip just this article
                log.warn("Skipping article with invalid URL: {}", href, e);
                continue;
            }

            log.info("Creating document for article with URL: {}", uri.toASCIIString());

            // Create a basic html page
            Document article = Jsoup.parse("<html><head></head><body></body></html>");

            // Insert the article into the basic html page
            article.body().html(element.html());

            // Clone the existing document with the new URI and content, preserving all other
            // attributes including meta data
            docs.add(document.cloneWithURI(uri).cloneWithStringContent(document.getDocumentType(), article.html()));
        }
        return FilterResult.of(docs);
    }
}
DocumentFilterSplitHtmlDocumentTest.java
package com.example.pluginexamples;

import org.junit.Assert;
import org.junit.Test;

import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;

/**
 * Tests for {@link DocumentFilterSplitHtmlDocument}.
 */
public class DocumentFilterSplitHtmlDocumentTest {

    /** HTML fixture containing two articles, each with its own link and paragraph. */
    private static final String TWO_ARTICLE_HTML =
            "<html><head></head><body>\n"
                    + "<article>\n"
                    + "<a href=\"http://articale.com/one\">Article one</a> \n"
                    + "<p>Article one stuff</p>\n"
                    + "</article>\n"
                    + "<article>\n"
                    + "<a href=\"http://articale.com/two\">Article two</a> \n"
                    + "<p>Article two stuff</p>\n"
                    + "</article>\n"
                    + "</body></html>";

    /**
     * Filters a document holding two articles and checks that two documents come back,
     * each carrying the URL and content of exactly one article.
     */
    @Test
    public void splitArticlesTest() {
        // Build the source document from the two-article fixture
        StringDocument source = MockDocuments.mockEmptyStringDoc()
                .cloneWithStringContent(DocumentType.MIME_HTML_TEXT, TWO_ARTICLE_HTML);

        DocumentFilterSplitHtmlDocument splitter = new DocumentFilterSplitHtmlDocument();
        FilterResult result = splitter.filter(source, MockFilterContext.getEmptyContext());

        Assert.assertEquals("Should have returned two documents one for each article",
                2, result.getFilteredDocuments().size());

        // First produced document must correspond to article one only
        StringDocument first = (StringDocument) result.getFilteredDocuments().get(0);
        String firstContent = first.getContentAsString();

        Assert.assertEquals("Article one URL is wrong",
                "http://articale.com/one", first.getURI().toASCIIString());
        Assert.assertTrue("Check article one has article one content",
                firstContent.contains("Article one stuff"));
        Assert.assertFalse("Check article one does NOT have article two content",
                firstContent.contains("Article two stuff"));

        // Second produced document must correspond to article two only
        StringDocument second = (StringDocument) result.getFilteredDocuments().get(1);
        String secondContent = second.getContentAsString();

        Assert.assertEquals("Article two URL is wrong",
                "http://articale.com/two", second.getURI().toASCIIString());
        Assert.assertTrue("Check article two has article two content",
                secondContent.contains("Article two stuff"));
        Assert.assertFalse("Check article two does NOT have article one content",
                secondContent.contains("Article one stuff"));
    }
}