當大象遇上PDFBox... 這個標題看起來蠻有趣的,相反的當「Hadoop + PDFBox」就太正式了.. XD
兩個月前筆者曾po「PDFBox - 擷取PDF檔案中的純文字」,現在一樣請多台機器一起來做這件事~ 如果沒機會體驗的話~ 看看「Self-service, Prorated Super Computing Fun!」這篇描述NYT在兩年前用Hadoop將1100萬份文章的TIFF影像檔轉成PDF檔案,重點在於只花了一天的時間就搞定了... = =" 而本文要做的就是分散式的將這些PDF檔案擷取出純文字~ 當然會比一台機器快多了~ (不過話說我也是在一台機器上測試...)
P.S. third party library 請記得放在「lib」資料夾一同打包
import java.io.ByteArrayInputStream; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.lib.NullOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.util.PDFTextStripper; public class PDF2TXT extends Configured implements Tool { public static class Map extends MapReduceBase implements Mapper<NullWritable, BytesWritable, Text, Text> { private JobConf conf; @Override public void configure(JobConf conf) { this.conf = conf; } public void map(NullWritable key, BytesWritable value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String filename = conf.get("map.input.file"); String output_dir = conf.get("output.dir"); filename = getFileName(filename); FileSystem fs = FileSystem.get(conf); FSDataOutputStream dos = fs.create(new Path(output_dir + filename + ".txt")); PDDocument document = PDDocument.load(new ByteArrayInputStream(value.getBytes())); PDFTextStripper stripper = new PDFTextStripper(); stripper.setStartPage(1); stripper.setEndPage(document.getNumberOfPages()); String s = stripper.getText(document); dos.write(s.getBytes("UTF-8")); dos.close(); } public String getFileName(String s) { return s.substring(s.lastIndexOf("/"), s.lastIndexOf(".")); } } public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), PDF2TXT.class); 
conf.set("output.dir", args[1]); conf.setJobName("PDF2TXT"); conf.setMapperClass(Map.class); conf.setInputFormat(WholeFileInputFormat.class); conf.setOutputFormat(NullOutputFormat.class); conf.set("mapred.child.java.opts", "-Xmx256m"); conf.setNumReduceTasks(0); WholeFileInputFormat.setInputPaths(conf, new Path(args[0])); JobClient.runJob(conf); return 0; } public static void main(String[] args) { try { int res = ToolRunner.run(new Configuration(), new PDF2TXT(), args); System.exit(res); } catch (Exception e) { e.printStackTrace(); } } }
.原始檔