java读取pdf(可分页读取) - JAVA

需要先把 pdfbox 和 log4j 的 jar 包加入 classpath。
下面举个例子：
import org.pdfbox.pdfparser.*;
import org.pdfbox.util.PDFTextStripper;
import java.io.*;
/**
* 测试pdfbox
* @author kingfish
* @version 1.0
*/
public class TestPdf {
public static void main(String[] args) throws Exception{
FileInputStream fis = new FileInputStream("c://intro.pdf");
PDFParser p = new PDFParser(fis);
p.parse();
PDFTextStripper ts = new PDFTextStripper();
String s = ts.getText(p.getPDDocument());
System.out.println(s);
fis.close();
}
}

--------------------------------------------------------------------------------

import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;
/**
* This is a wrapper for the Pj PDF parser
*/
public class PjWrapper {
Pdf pdf;
PjCatalog catalog;
PjPagesNode rootPage;
/**
 * Opens the named PDF and caches its catalog and the root node of its
 * page tree in the corresponding fields.
 *
 * @param PdfFileName  name of the PDF file passed to the {@code Pdf} constructor
 * @param TextFileName not used by this constructor
 * @throws IOException declared by this constructor; propagated from the Pj calls
 * @throws PjException declared by this constructor; propagated from the Pj calls
 */
public PjWrapper(String PdfFileName, String TextFileName)
        throws IOException, PjException {
    this.pdf = new Pdf(PdfFileName);

    // The catalog is looked up directly and cast; this assumes it is never
    // itself stored as a reference.
    this.catalog = (PjCatalog) this.pdf.getObject(this.pdf.getCatalog());

    // The catalog names the root of the pages tree by reference, so it has
    // to be resolved before it can be cast.
    this.rootPage = (PjPagesNode) this.pdf.resolve(this.catalog.getPages());
}
/**
 * Command-line entry point; currently performs no work.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared but not thrown by the current body
 * @throws PjException declared but not thrown by the current body
 */
public static void main(String[] args) throws IOException, PjException {
    // Disabled sketch kept for reference:
    //   PjWrapper testWrapper = new PjWrapper(args[0]);
    //   LinkedList textList = testWrapper.getAllText();
}
/**
* Returns as much text as we can extract from the PDF.
* This currently includes:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* be indexable
*/
public LinkedList getAllText() throws PjException {
LinkedList stringList = new LinkedList();
Iterator streamIter = getAllContentsStreams().iterator();
PjStream stream;
String streamData;

String streamText;
boolean moreData;
int textStart, textEnd;
//System.out.println("Going through streams...");
while(streamIter.hasNext()) {
//System.out.println("Getting next stream");
stream = (PjStream) streamIter.next();
//System.out.println("Adding text from stream with filter: "
+getFilterString(stream);
stream = stream.flateDecompress();
//System.out.println("Adding text from stream with filter
afterdecompress: " + getFilterString(stream));
streamData = new String(stream.getBuffer());
streamText = new String();
moreData = true;
textStart = textEnd = 0;
while(moreData) {
if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) {
moreData = false;
break;
}
if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) {
moreData = false;
break;
}
try {
streamText +=
PjString.decodePdf(streamData.substring(textStart,textEnd + 1));
} catch (Exception e) {
System.out.println("malformed string: " +
streamData.substring(textStart, textEnd + 1));
}
}
//if(streamText.equals("inserted text"))
System.out.println(streamText);
if (streamText.length() > 0)
stringList.add(streamText);
}
return stringList;
}
public static String getFilterString(PjStream stream) throws PjException
{
String filterString = new String();
PjObject filter;
//System.out.println("getting filter from dictionary");
if ((filter = stream.getStreamDictionary().getFilter()) == null) {
//System.out.println("Got null filter");
return "";
}
//System.out.println("got it");
// filter should either be a name or an array of names
if (filter instanceof PjName) {
//System.out.println("getting filter string from simple name");
filterString = ((PjName) filter).getString();
} else {
//

java读取pdf(可分页读取)(一)