當大象遇上PDFBox...

當大象遇上PDFBox... 這個標題看起來蠻有趣的，相反的當「Hadoop + PDFBox」就太正式了.. XD

兩個月前筆者曾po「PDFBox - 擷取PDF檔案中的純文字」，現在一樣請多台機器一起來做這件事~ 如果沒機會體驗的話~ 看看「Self-service, Prorated Super Computing Fun!」這篇描述NYT在兩年前用Hadoop將1100萬份文章的TIFF影像檔轉成PDF檔案，重點在於只花了一天的時間就搞定了... = =" 而本文要做的就是分散式的將這些PDF檔案擷取出純文字~ 當然會比一台機器快多了~ (不過話說我也是在一台機器上測試...)

P.S. third party library 請記得放在「lib」資料夾一同打包

import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * Map-only Hadoop job that extracts the plain text of PDF files with PDFBox.
 *
 * Each input file is delivered to the mapper as a single record (the job is
 * configured with WholeFileInputFormat), and the extracted text is written
 * directly to the file system as "&lt;output.dir&gt;/&lt;input base name&gt;.txt".
 * Nothing is emitted through the OutputCollector; the job uses
 * NullOutputFormat and zero reducers.
 *
 * Usage: PDF2TXT &lt;input path&gt; &lt;output dir&gt;
 */
public class PDF2TXT extends Configured implements Tool
{

    /**
     * Mapper that converts one whole PDF file (the record value) into a
     * UTF-8 text file in the directory named by the "output.dir" property.
     */
    public static class Map extends MapReduceBase implements
            Mapper<NullWritable, BytesWritable, Text, Text>
    {

        // Job configuration captured in configure(); read again in map().
        private JobConf conf;

        @Override
        public void configure(JobConf conf)
        {
            this.conf = conf;
        }

        /**
         * Extracts the text of the PDF held in {@code value} and writes it to
         * "&lt;output.dir&gt;&lt;/base name&gt;.txt".
         *
         * @param key      unused
         * @param value    raw bytes of one PDF file
         * @param output   unused; results are written straight to the file system
         * @param reporter unused
         * @throws IOException if a required job property is missing, the PDF
         *                     cannot be parsed, or the output cannot be written
         */
        public void map(NullWritable key, BytesWritable value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException
        {
            String inputFile = conf.get("map.input.file");
            String outputDir = conf.get("output.dir");
            if (inputFile == null || outputDir == null)
            {
                throw new IOException("Missing job property: map.input.file="
                        + inputFile + ", output.dir=" + outputDir);
            }

            String filename = getFileName(inputFile);
            if (!filename.startsWith("/"))
            {
                // Keep a separator between the output directory and the name
                // even when the input path contained no '/'.
                filename = "/" + filename;
            }

            // Only the first getLength() bytes of the backing array are valid;
            // the rest is unused capacity and must not be fed to the parser.
            PDDocument document = PDDocument.load(
                    new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
            String text;
            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setStartPage(1);
                stripper.setEndPage(document.getNumberOfPages());
                text = stripper.getText(document);
            }
            finally
            {
                document.close();
            }

            // Create the output file only after extraction succeeded, so an
            // unreadable PDF does not leave an empty .txt file behind.
            FileSystem fs = FileSystem.get(conf);
            FSDataOutputStream dos = fs.create(new Path(outputDir + filename + ".txt"));
            try
            {
                dos.write(text.getBytes("UTF-8"));
            }
            finally
            {
                dos.close();
            }
        }

        /**
         * Returns the last path component of {@code s} without its extension.
         * The leading '/' is kept when the path contains one, e.g.
         * "hdfs://host/in/a.pdf" yields "/a".
         *
         * A name without an extension is returned whole, and a path without
         * any '/' is returned from its first character.
         *
         * @param s path or URI string of the input file
         * @return "/name" (or "name" when {@code s} has no '/')
         */
        public String getFileName(String s)
        {
            int slash = s.lastIndexOf('/');
            int dot = s.lastIndexOf('.');
            int start = Math.max(slash, 0);
            // A '.' located before the last '/' belongs to a directory name,
            // not to this file's extension.
            int end = dot > slash ? dot : s.length();
            return s.substring(start, end);
        }
    }

    /**
     * Configures and runs the map-only extraction job.
     *
     * @param args args[0] is the input path, args[1] is the output directory
     * @return 0 on success, -1 when the arguments are missing
     * @throws Exception if job submission or execution fails
     */
    public int run(String[] args) throws Exception
    {
        if (args == null || args.length < 2)
        {
            System.err.println("Usage: PDF2TXT <input path> <output dir>");
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        JobConf conf = new JobConf(getConf(), PDF2TXT.class);
        conf.set("output.dir", args[1]);

        conf.setJobName("PDF2TXT");
        conf.setMapperClass(Map.class);

        conf.setInputFormat(WholeFileInputFormat.class);
        conf.setOutputFormat(NullOutputFormat.class);

        conf.set("mapred.child.java.opts", "-Xmx256m");
        conf.setNumReduceTasks(0);

        WholeFileInputFormat.setInputPaths(conf, new Path(args[0]));
        JobClient.runJob(conf);
        return 0;
    }

    /**
     * Command-line entry point; exits with the status returned by run(),
     * or with 1 when the job throws.
     */
    public static void main(String[] args)
    {
        try
        {
            int res = ToolRunner.run(new Configuration(), new PDF2TXT(), args);
            System.exit(res);
        } catch (Exception e)
        {
            e.printStackTrace();
            // Report the failure to the calling shell instead of exiting 0.
            System.exit(1);
        }
    }
}

．原始檔

當大象遇上PDFBox...

Leave a Comment

::: 搜尋 :::

::: 分類 :::

::: 最新文章 :::

::: 最新回應 :::

::: 訂閱 :::