Apache Ant/Converting PDF to XML
Appearance
Science and industry in Basic 7
Apache Ant Project to Extract Text From PDF
[edit | edit source]
<!--
  Ant build file that runs the Apache Tika command-line application on a
  single PDF and stores what Tika prints in an output file.

  Every property below can be overridden on the command line, e.g.
    ant -Dinput-pdf-file=report.pdf -Doutput-clean-xhtml-file=report.xhtml
-->
<project name="extract-text-from-pdf" default="extract-text-from-pdf">
  <description>Sample invocations of Apache Tika</description>

  <!-- Directory holding the Tika application jar. -->
  <property name="lib.dir" value="../lib"/>
  <!-- Full path of the Tika application jar; override to use another version. -->
  <property name="tika.jar" value="${lib.dir}/tika-app-1.3.jar"/>
  <!-- PDF handed to Tika. -->
  <property name="input-pdf-file" value="myDocument.pdf"/>
  <!-- File that receives Tika's standard output. -->
  <property name="output-clean-xhtml-file" value="output-clean.xhtml"/>

  <target name="extract-text-from-pdf">
    <echo message="Extracting XML from PDF: ${input-pdf-file} to ${output-clean-xhtml-file}"/>
    <!--
      fork="true" is required by Ant when launching with the jar attribute.
      input=  : the PDF is supplied to the forked JVM on standard input.
      output= : standard output of the forked JVM is written to this file.
      logError="true" : standard error goes to the Ant log. Without it Ant
                        writes stderr into the output file as well, so any
                        warning printed by Tika would end up inside the
                        generated document.
    -->
    <java jar="${tika.jar}"
          fork="true"
          failonerror="true"
          maxmemory="128m"
          input="${input-pdf-file}"
          output="${output-clean-xhtml-file}"
          logError="true">
      <!-- -x asks Tika for XHTML output (see the Tika command-line usage). -->
      <arg value="-x"/>
    </java>
  </target>
</project>