Filter example - binary to string conversion
This example shows a simple filter plugin that converts a document in some binary form, such as a PDF,
to a String
containing HTML. Here we outline an example of being given a binary document and converting it to a String.
The example below shows a simple filter implementation and corresponding tests.
Example
In this example we simply convert all JSON
documents from raw bytes
to a String
assuming the charset is UTF-8
. This example implements the BytesDocumentFilter
. We are required to implement canFilter()
, which checks that the document type is JSON to determine whether the filter should run, as well as filterAsBytesDocument()
which contains the logic for the filter.
DocumentFilterBinaryToStringConversion.java
package com.example.pluginexamples;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.funnelback.filter.api.FilterContext;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.NoContentDocument;
import com.funnelback.filter.api.documents.BytesDocument;
import com.funnelback.filter.api.filters.PreFilterCheck;
import com.funnelback.filter.api.filters.BytesDocumentFilter;
import com.funnelback.filter.api.DocumentType;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
 * Example filter that converts JSON documents from raw bytes into a String
 * document, assuming the bytes are encoded as UTF-8.
 *
 * <p>The filter only attempts to run on documents whose type is JSON; every
 * other document type is skipped.
 */
public class DocumentFilterBinaryToStringConversion implements BytesDocumentFilter {

    private static final Logger log = LogManager.getLogger(DocumentFilterBinaryToStringConversion.class);

    /**
     * Decides whether this filter should run for the given document.
     *
     * @param document the document being considered, without its content
     * @param context  the context the filter is running in
     * @return {@link PreFilterCheck#ATTEMPT_FILTER} when the document type is JSON,
     *         otherwise {@link PreFilterCheck#SKIP_FILTER}
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext context) {
        // Only JSON documents are converted; anything else is left untouched.
        if (document.getDocumentType().isJSON()) {
            return PreFilterCheck.ATTEMPT_FILTER;
        }
        return PreFilterCheck.SKIP_FILTER;
    }

    /**
     * Converts the raw bytes of the document into a String document.
     *
     * @param document the document whose content is held as bytes
     * @param context  the context the filter is running in; supplies the document factory
     * @return a filter result holding the document re-created as a String document
     *         with the JSON document type
     */
    @Override
    public FilterResult filterAsBytesDocument(BytesDocument document, FilterContext context) {
        log.trace("Get the document content as a byte array and convert it to a String assuming the bytes are UTF-8.");

        // Get the document content as a byte array and convert it to a String
        // assuming the bytes are UTF-8
        byte[] documentContentAsBytes = document.getCopyOfContents();
        String contentAsString = new String(documentContentAsBytes, UTF_8);

        return FilterResult.of(context.getFilterDocumentFactory().toStringDocument(document,
                DocumentType.MIME_APPLICATION_JSON_TEXT,
                contentAsString));
    }
}
DocumentFilterBinaryToStringConversionTest.java
package com.example.pluginexamples;
import com.funnelback.filter.api.documents.StringDocument;
import org.junit.Assert;
import org.junit.Test;
import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.FilterResult;
import com.funnelback.filter.api.documents.BytesDocument;
import com.funnelback.filter.api.mock.MockDocuments;
import com.funnelback.filter.api.mock.MockFilterContext;
import java.util.Optional;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
 * Tests for {@link DocumentFilterBinaryToStringConversion}.
 */
public class DocumentFilterBinaryToStringConversionTest {

    /**
     * A JSON document supplied as UTF-8 bytes should come back as a String
     * document with identical content, including non-ASCII characters.
     */
    @Test
    public void testJSONIsConverted() throws Exception {
        String expected = "{\"accents\": \"é à ê\"}";

        // Create the dummy JSON input document.
        // The Charset constant is used instead of the "UTF-8" name so no
        // charset lookup by name (and its checked exception) is involved.
        BytesDocument inputDoc = MockDocuments.mockByteDoc(
                "http://foo.com/",
                DocumentType.APPLICATION_JSON_TEXT,
                Optional.empty(),
                expected.getBytes(UTF_8));

        // Create and run the filter
        FilterResult filterResult = new DocumentFilterBinaryToStringConversion()
                .filter(inputDoc, MockFilterContext.getEmptyContext());

        // Get the resulting filtered document from the filter result
        // (we assume a document will be returned)
        StringDocument filteredDocument = (StringDocument) filterResult
                .getFilteredDocuments().get(0);

        Assert.assertEquals(
                "Content was not correctly converted to a string",
                expected,
                filteredDocument.getContentAsString());
    }

    /**
     * A non-JSON (HTML) document should cause the filter to be skipped.
     */
    @Test
    public void testFilterOnlyRunsOnJsonDocuments() throws Exception {
        // Create a dummy HTML input document.
        BytesDocument inputDoc = MockDocuments.mockEmptyByteDoc()
                .cloneWithContent(DocumentType.MIME_HTML_TEXT, Optional.empty(), "<html><p>Hello</p></html>".getBytes(UTF_8));

        // Create and run the filter.
        FilterResult filterResult = new DocumentFilterBinaryToStringConversion()
                .filter(inputDoc, MockFilterContext.getEmptyContext());

        Assert.assertTrue(
                "Filter should have been skipped as the document was not a JSON document",
                filterResult.isSkipped());
    }
}