本文实例讲述了C#使用iTextSharp将PDF转成文本的方法。分享给大家供大家参考。具体实现方法如下:
using System;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
/// <summary>
/// Command-line sample that copies the string tokens found in the page content
/// streams of a PDF into a text file, using iTextSharp's PRTokeniser.
/// </summary>
public class ParsingPDF
{
    /// <summary>
    /// Parses the PDF using PRTokeniser and writes every string token of every
    /// page to the text file, one token per line.
    /// </summary>
    /// <param name="src">The path to the original PDF file.</param>
    /// <param name="dest">The path to the resulting text file; it is created or overwritten.</param>
    public void parsePdf(string src, string dest)
    {
        // Open the PDF before touching dest, so a PDF that cannot be read
        // does not create or truncate the output file.
        PdfReader reader = new PdfReader(src);
        try
        {
            // Disposing the writer flushes it and closes the underlying file,
            // even when parsing throws part-way through.
            using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
            {
                int pageCount = reader.NumberOfPages;
                for (int pg = 1; pg <= pageCount; pg++)
                {
                    // Tokenize the raw content stream of the page and keep
                    // only the string tokens.
                    byte[] streamBytes = reader.GetPageContent(pg);
                    PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                        {
                            output.WriteLine(tokenizer.StringValue);
                        }
                    }
                }
            }
        }
        finally
        {
            // Always release the reader, whether or not parsing succeeded.
            reader.Close();
        }
    }

    /// <summary>
    /// Main method. Usage: ParsePDF infile.pdf [outfile.txt].
    /// Prints a usage line when the argument count is not 1 or 2; otherwise
    /// converts the PDF and reports the elapsed time, or prints the error message.
    /// </summary>
    /// <param name="args">The input PDF path, optionally followed by the output text path.</param>
    static void Main(string[] args)
    {
        if (args.Length < 1 || args.Length > 2)
        {
            Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
            return;
        }

        string pdfPath = args[0];

        // Without an explicit output path, name the text file after the input
        // file (directory part dropped, extension replaced by ".txt").
        string textPath = args.Length == 2
            ? args[1]
            : Path.GetFileNameWithoutExtension(pdfPath) + ".txt";

        try
        {
            // Stopwatch measures elapsed time independently of wall-clock changes.
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
            ParsingPDF example = new ParsingPDF();
            example.parsePdf(pdfPath, textPath);
            timer.Stop();
            Console.WriteLine("Parsing completed in {0:0.00} seconds.", timer.Elapsed.TotalSeconds);
        }
        catch (Exception ex)
        {
            Console.WriteLine("ERROR: " + ex.Message);
            // Let calling scripts detect the failure through the process exit code.
            Environment.ExitCode = 1;
        }
    } // Main

    /// <summary>
    /// Render listener that writes the text it receives to a writer, wrapping
    /// each text block and each text chunk in angle brackets.
    /// Nested inside ParsingPDF; it is not referenced by the other code in this class.
    /// </summary>
    public class MyTextRenderListener : IRenderListener
    {
        /// <summary>The writer to which the information will be written.</summary>
        protected StreamWriter output;

        /// <summary>
        /// Creates a RenderListener that will look for text.
        /// </summary>
        /// <param name="output">The writer that receives the extracted text.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="output"/> is null.</exception>
        public MyTextRenderListener(StreamWriter output)
        {
            // Fail at construction instead of with a NullReferenceException
            // on the first callback.
            if (output == null)
            {
                throw new ArgumentNullException("output");
            }
            this.output = output;
        }

        /// <summary>Writes the opening "&lt;" of a text block.</summary>
        public void BeginTextBlock()
        {
            output.Write("<");
        }

        /// <summary>Writes the closing "&gt;" of a text block and ends the line.</summary>
        public void EndTextBlock()
        {
            output.WriteLine(">");
        }

        /// <summary>Images are ignored.</summary>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
        }

        /// <summary>Writes the text of the chunk wrapped in "&lt;" and "&gt;".</summary>
        public void RenderText(TextRenderInfo renderInfo)
        {
            output.Write("<");
            output.Write(renderInfo.GetText());
            output.Write(">");
        }
    } // class MyTextRenderListener
} // class ParsingPDF
希望本文所述对大家的C#程序设计有所帮助。
您可能感兴趣的文章:
- c#实现将pdf转文本的示例分享
- 利用C#如何给PDF文档添加文本与图片页眉
- C# 生转换网页为pdf
- C# 中使用iTextSharp组件创建PDF的简单方法
- C#将jpg转换为pdf的方法
- C#实现pdf导出 .Net导出pdf文件
- 用C#来解析PDF文件
- c#开发word批量转pdf源码分享
- c#实现pdf的另存为功能
- C#在PDF中绘制不同风格类型的文本方法实例