blog.Ring.idv.tw

Average Length of URL?

Average Length of URL?


從過年前到目前為止~ 都一直和學弟忙於將以前所實作的東西要轉換到線上版,我還需要點時間呀~ ><"

而在轉換的過程之中~ 突然想到一個問題!! 那就是全世界URL的平均長度究竟約多長?

我想這個答案只有大型搜尋引擎(Google、Yahoo、Cuil)能給出一個較接近的答案吧~

下述是一個簡單計算這樣結果的MapReduce小程式:

URLList

http://l.yimg.com/f/a/tw/ivychang/708971_020409_420x80_0202_yahoo-elite.swf
http://l.yimg.com/tw.yimg.com/a/tw/ivychang/712756_1231_1231new350_100.swf
http://l.yimg.com/tw.yimg.com/a/tw/erinlin/721493_0123_350x200.swf
http://www.kriesi.at/wp-content/themes/dark_rainbow/js/Particles.swf
http://tw.promo.yahoo.com/2008auction/shpticket/images/top.swf
http://l.yimg.com/tw.yimg.com/a/tw/fanny/658216_101508_420x80_4.swf
http://l.yimg.com/f/a/tw/vikii/606895_shopping_center_20090203r.swf
http://l.yimg.com/f/a/tw/hedy/697827_e3_hp_012109.swf
http://l.yimg.com/tw.yimg.com/a/tw/ivychang/708334_0120_350x200_certificate_081224.swf
http://l.yimg.com/tw.yimg.com/a/tw/ivychang/708334_0120_350x100_linux_080826.swf
http://www.ysed.org.tw/3rd_upLoad/4156/index.swf

URLAvgLength

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class URLAvgLength extends Configured implements Tool {

	static enum Counter {
		URL_COUNT
	}

	public static class Map extends MapReduceBase implements
			Mapper<LongWritable, Text, Text, IntWritable> {

		private final static Text word = new Text("Len");

		public void map(LongWritable key, Text value,
				OutputCollector<Text, IntWritable> output, Reporter reporter)
				throws IOException {

			String key2 = value.toString();
			reporter.incrCounter(Counter.URL_COUNT, 1);
			output.collect(word, new IntWritable(key2.length()));
		}
	}

	public static class Reduce extends MapReduceBase implements
			Reducer<Text, IntWritable, Text, IntWritable> {

		public void reduce(Text key, Iterator<IntWritable> values,
				OutputCollector<Text, IntWritable> output, Reporter reporter)
				throws IOException {

			int sum = 0;
			while (values.hasNext()) {
				sum += values.next().get();
			}

			output.collect(key, new IntWritable(sum));
		}
	}

	public int run(String[] args) throws Exception {
		String input = "/usr/Ring/urllist/*";
		String output = "/usr/Ring/urlavglen";
		JobConf conf = new JobConf(getConf(), URLAvgLength.class);
		FileSystem fs = FileSystem.get(conf);
		fs.delete(new Path(output), true);

		conf.setJobName("URLAvgLength");
		conf.setOutputKeyClass(Text.class);
		conf.setOutputValueClass(IntWritable.class);

		conf.setMapperClass(Map.class);
		conf.setCombinerClass(Reduce.class);
		conf.setReducerClass(Reduce.class);

		conf.setInputFormat(TextInputFormat.class);
		conf.setOutputFormat(TextOutputFormat.class);

		conf.setNumReduceTasks(1);

		TextInputFormat.setInputPaths(conf, new Path(input));
		TextOutputFormat.setOutputPath(conf, new Path(output));

		RunningJob running = JobClient.runJob(conf);
		Counters ct = running.getCounters();
		long count = ct.getCounter(Counter.URL_COUNT);

		InputStream in = fs.open(new Path("hdfs://localhost:9000"+output+"/part-00000"));
		BufferedReader br = new BufferedReader(new InputStreamReader(in));
		String line = br.readLine();
		Integer value = Integer.parseInt(line.split("\t")[1]);
		System.out.println("Avg:" + value/count);
		return 0;
	}

	public static void main(String[] args) {
		try {
			int res = ToolRunner.run(new Configuration(), new URLAvgLength(),args);
			System.exit(res);
		} catch (Exception e) {
			e.printStackTrace();
		}

	}
}
Avg:67

2009-02-07 02:29:48

1 comments on "Average Length of URL?"

  1. doris 說:

    哈哈,我也在思考这个问题。
    “全世界URL的平均長度究竟約多長?”

    2009-06-26 11:11:02

Leave a Comment

Copyright (C) Ching-Shen Chen. All rights reserved.

::: 搜尋 :::

::: 分類 :::

::: Ads :::

::: 最新文章 :::

::: 最新回應 :::

::: 訂閱 :::

Atom feed
Atom Comment