當大象遇上PDFBox... 這個標題看起來蠻有趣的,相反的當「Hadoop + PDFBox」就太正式了.. XD
兩個月前筆者曾po「PDFBox - 擷取PDF檔案中的純文字」,現在一樣請多台機器一起來做這件事~ 如果沒機會體驗的話~ 看看「Self-service, Prorated Super Computing Fun!」這篇描述NYT在兩年前用Hadoop將1100萬份文章的TIFF影像檔轉成PDF檔案,重點在於只花了一天的時間就搞定了... = =" 而本文要做的就是分散式的將這些PDF檔案擷取出純文字~ 當然會比一台機器快多了~ (不過話說我也是在一台機器上測試...)
P.S. third party library 請記得放在「lib」資料夾一同打包
import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
 * Map-only Hadoop job that extracts the plain text of PDF files with PDFBox.
 *
 * <p>Each map call receives the bytes of one input record (the job is
 * configured with {@code WholeFileInputFormat}, which is presumably one whole
 * file per record -- that class is defined outside this file) and writes the
 * extracted text to {@code <output dir>/<input base name>.txt} on the
 * configured file system. Nothing is emitted through the
 * {@link OutputCollector}; the job uses {@link NullOutputFormat} and zero
 * reducers.
 *
 * <p>Usage: {@code PDF2TXT <input path> <output dir>}
 */
public class PDF2TXT extends Configured implements Tool
{
    /**
     * Mapper that turns the bytes of one PDF into a UTF-8 text file.
     */
    public static class Map extends MapReduceBase implements
            Mapper<NullWritable, BytesWritable, Text, Text>
    {
        private JobConf conf;

        /**
         * Keeps the job configuration so {@link #map} can read the current
         * input file name and the output directory from it.
         *
         * @param conf the job configuration handed in by the framework
         */
        @Override
        public void configure(JobConf conf)
        {
            this.conf = conf;
        }

        /**
         * Extracts the text of the PDF held in {@code value} and writes it to
         * {@code <output.dir>/<base name of map.input.file>.txt}.
         *
         * @param key      unused
         * @param value    raw bytes of the PDF document
         * @param output   unused; the result is written directly to the file system
         * @param reporter unused
         * @throws IOException if a required configuration value is missing, the
         *                     PDF cannot be parsed, or the output cannot be written
         */
        public void map(NullWritable key, BytesWritable value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            String filename = conf.get("map.input.file");
            String output_dir = conf.get("output.dir");
            if (filename == null)
            {
                throw new IOException("map.input.file is not set; cannot name the output file");
            }
            if (output_dir == null)
            {
                throw new IOException("output.dir is not set; cannot locate the output directory");
            }

            // Parse the PDF before creating the output file so that a corrupt
            // document does not leave an empty .txt file behind.
            String text = extractText(value);

            // getFileName keeps the leading '/'; drop it so the name is treated
            // as a child of output_dir rather than as an absolute path.
            String baseName = getFileName(filename);
            if (baseName.startsWith("/"))
            {
                baseName = baseName.substring(1);
            }
            Path target = new Path(output_dir, baseName + ".txt");

            // The FileSystem instance itself is intentionally not closed here;
            // only the stream opened by this call is.
            FileSystem fs = FileSystem.get(conf);
            FSDataOutputStream dos = fs.create(target);
            try
            {
                dos.write(text.getBytes("UTF-8"));
            }
            finally
            {
                dos.close();
            }
        }

        /**
         * Runs PDFBox over the bytes in {@code value} and returns the text of
         * all pages.
         *
         * @param value raw PDF bytes
         * @return the extracted text
         * @throws IOException if the document cannot be loaded or read
         */
        private static String extractText(BytesWritable value) throws IOException
        {
            // getBytes() exposes the backing array, which may be larger than the
            // payload; only the first getLength() bytes are valid data.
            ByteArrayInputStream in =
                    new ByteArrayInputStream(value.getBytes(), 0, value.getLength());
            PDDocument document = PDDocument.load(in);
            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setStartPage(1);
                stripper.setEndPage(document.getNumberOfPages());
                return stripper.getText(document);
            }
            finally
            {
                document.close();
            }
        }

        /**
         * Returns the last path component of {@code s} without its extension.
         *
         * <p>When {@code s} contains a {@code '/'}, the result starts with that
         * last {@code '/'} (for example {@code "/dir/a.pdf"} yields
         * {@code "/a"}). When there is no {@code '/'}, the result starts at the
         * beginning of {@code s}. The extension is stripped only if the last
         * {@code '.'} lies after the last {@code '/'}; otherwise the component
         * is returned whole.
         *
         * @param s a file path or URI string
         * @return the base name, with a leading {@code '/'} if {@code s} had one
         */
        public String getFileName(String s)
        {
            int slash = s.lastIndexOf("/");
            int dot = s.lastIndexOf(".");
            int start = slash < 0 ? 0 : slash;
            // A dot before the last slash belongs to a directory name, not to
            // the file's extension.
            int end = dot > slash ? dot : s.length();
            return s.substring(start, end);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             output directory
     * @return 0 when the job was run, 2 when the arguments are missing
     * @throws Exception if job submission or execution fails
     */
    public int run(String[] args) throws Exception
    {
        if (args == null || args.length < 2)
        {
            System.err.println("Usage: PDF2TXT <input path> <output dir>");
            ToolRunner.printGenericCommandUsage(System.err);
            return 2;
        }

        JobConf conf = new JobConf(getConf(), PDF2TXT.class);
        conf.set("output.dir", args[1]);
        conf.setJobName("PDF2TXT");
        conf.setMapperClass(Map.class);
        conf.setInputFormat(WholeFileInputFormat.class);
        // The mapper writes its own files, so the framework has nothing to output.
        conf.setOutputFormat(NullOutputFormat.class);
        conf.set("mapred.child.java.opts", "-Xmx256m");
        conf.setNumReduceTasks(0);
        WholeFileInputFormat.setInputPaths(conf, new Path(args[0]));
        JobClient.runJob(conf);
        return 0;
    }

    /**
     * Command-line entry point; exits with the status returned by
     * {@link #run}, or 1 if an exception escapes.
     *
     * @param args command-line arguments, passed through {@link ToolRunner}
     */
    public static void main(String[] args)
    {
        int res;
        try
        {
            res = ToolRunner.run(new Configuration(), new PDF2TXT(), args);
        } catch (Exception e)
        {
            e.printStackTrace();
            // Report the failure to the caller instead of exiting with 0.
            res = 1;
        }
        System.exit(res);
    }
}
.原始檔
