Filter example - split a single HTML document into multiple documents
This example shows a simple filter plugin that splits an HTML source document into multiple documents, which will each be indexed separately.
The example uses Jsoup (the same library as used by Jsoup filters), which is applied within the string document filter to convert the HTML document into an HTML document object that can then be iterated over.
HTML document (Jsoup) filters should normally be used for working with HTML documents. Splitting an HTML document is an example where you must use a general document filter, as HTML document (Jsoup) filters don’t support the splitting of documents.
The example below shows a simple filter implementation and corresponding tests.
Example
In this example we split the input HTML document on its <article> elements, creating a new document for each <article> in the original document. As we are creating new documents, we must assign a new URI to each one; in this example the URI is taken from the first link (href) found inside each article in the source document. New documents are created by cloning the original document, which preserves other parts of the document such as metadata. This example implements the StringDocumentFilter interface. We are required to implement canFilter(), which is used to check whether the given document is an HTML document, as well as filterAsStringDocument(), which contains the logic for the filter.
DocumentFilterSplitHtmlDocument.java
package com.example.pluginexamples;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.documents.FilterableDocument;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.StringDocumentFilter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.net.URI;
import java.util.ArrayList;
/**
 * Splits a single HTML document into one document per {@code <article>} element.
 *
 * <p>Each produced document is a clone of the source document whose URI is taken from the
 * first {@code href} found on a link inside the article, and whose content is a minimal HTML
 * page wrapping the article's inner HTML.
 *
 * <p>Articles that have no usable link (no {@code href}, a blank one, or one that cannot be
 * parsed as a URI) are skipped with a warning, because no URI can be assigned to them.
 */
public class DocumentFilterSplitHtmlDocument implements StringDocumentFilter {

    private static final Logger log = LogManager.getLogger(DocumentFilterSplitHtmlDocument.class);

    /**
     * Decides whether this filter should run on the given document.
     *
     * @param document the document being considered (content not loaded)
     * @param filterContext the filter context (unused here)
     * @return {@link PreFilterCheck#ATTEMPT_FILTER} for HTML documents, otherwise
     *         {@link PreFilterCheck#SKIP_FILTER}
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext filterContext) {
        if (document.getDocumentType().isHTML()) {
            // Only run on HTML documents
            return PreFilterCheck.ATTEMPT_FILTER;
        }
        return PreFilterCheck.SKIP_FILTER;
    }

    /**
     * Produces one new document for every {@code <article>} element that carries a usable link.
     *
     * @param document the source HTML document
     * @param filterContext the filter context (unused here)
     * @return a result built from the list of per-article documents; the list is empty when the
     *         source contains no article with a usable link
     */
    @Override
    public FilterResult filterAsStringDocument(StringDocument document, FilterContext filterContext) {
        // Create a jsoup object from the string document
        Document jsoupDoc = Jsoup.parse(document.getContentAsString(), document.getURI().toString());

        // Look for all the articles, which are the items we intend to split on
        ArrayList<FilterableDocument> docs = new ArrayList<>();
        for (Element element : jsoupDoc.select("article")) {
            // The first href found on a link inside the article becomes the new document's URI.
            // attr() returns an empty string when no link carries an href.
            String href = element.select("a").attr("href").trim();
            if (href.isEmpty()) {
                log.warn("Skipping article without a link URL; no URI can be assigned to it");
                continue;
            }

            URI uri;
            try {
                uri = URI.create(href);
            } catch (IllegalArgumentException e) {
                // One malformed link must not abort the split of the remaining articles
                log.warn("Skipping article with invalid URL: {}", href, e);
                continue;
            }
            log.info("Creating document for article with URL: {}", uri.toASCIIString());

            // Create a basic html page
            Document article = Jsoup.parse("<html><head></head><body></body></html>");

            // Insert the article into the basic html page
            article.body().html(element.html());

            // Clone the existing document with the new URI and content, preserving all other
            // attributes including meta data
            docs.add(document.cloneWithURI(uri)
                .cloneWithStringContent(document.getDocumentType(), article.html()));
        }
        return FilterResult.of(docs);
    }
}
DocumentFilterSplitHtmlDocumentTest.java
package com.example.pluginexamples;
import org.junit.Assert;
import org.junit.Test;
import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.StringDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;
/**
 * Tests that {@link DocumentFilterSplitHtmlDocument} produces one document per article.
 */
public class DocumentFilterSplitHtmlDocumentTest {

    /** Source page holding two articles, each with its own link and paragraph. */
    private static final String TWO_ARTICLE_HTML = String.join("\n",
        "<html><head></head><body>",
        "<article>",
        "<a href=\"http://articale.com/one\">Article one</a> ",
        "<p>Article one stuff</p>",
        "</article>",
        "<article>",
        "<a href=\"http://articale.com/two\">Article two</a> ",
        "<p>Article two stuff</p>",
        "</article>",
        "</body></html>");

    @Test
    public void splitArticlesTest() {
        // Build the input: an empty mock document given the two-article HTML content
        StringDocument source = MockDocuments.mockEmptyStringDoc()
            .cloneWithStringContent(DocumentType.MIME_HTML_TEXT, TWO_ARTICLE_HTML);

        // Run the filter under test
        DocumentFilterSplitHtmlDocument splitter = new DocumentFilterSplitHtmlDocument();
        FilterResult result = splitter.filter(source, MockFilterContext.getEmptyContext());

        Assert.assertEquals("Should have returned two documents one for each article",
            2, result.getFilteredDocuments().size());

        // First produced document: article one only
        StringDocument first = (StringDocument) result.getFilteredDocuments().get(0);
        Assert.assertEquals("Article one URL is wrong",
            "http://articale.com/one", first.getURI().toASCIIString());
        Assert.assertTrue("Check article one has article one content",
            first.getContentAsString().contains("Article one stuff"));
        Assert.assertFalse("Check article one does NOT have article two content",
            first.getContentAsString().contains("Article two stuff"));

        // Second produced document: article two only
        StringDocument second = (StringDocument) result.getFilteredDocuments().get(1);
        Assert.assertEquals("Article two URL is wrong",
            "http://articale.com/two", second.getURI().toASCIIString());
        Assert.assertTrue("Check article two has article two content",
            second.getContentAsString().contains("Article two stuff"));
        Assert.assertFalse("Check article two does NOT have article one content",
            second.getContentAsString().contains("Article one stuff"));
    }
}