Java版PageRank及网站收录情况查询代码-白红宇

Java版PageRank及网站收录情况查询代码

阅读量：2200 次

发布时间：2019-05-03

本文共 16080 字，大约阅读时间需要 53 分钟。

在Google这个由10的100次方得名的站点中，各种评估网站的算法层出不穷，而PageRank即是其中之一。

Google的PageRank根据网站的外部链接和内部链接的数量和质量来衡量网站的价值。PageRank背后的概念是，每个到页面的链接都是对该页面的一次投票，被链接的越多，就意味着被其他网站投票越多。这个就是所谓的“链接流行度”——衡量多少人愿意将他们的网站和你的网站挂钩。PageRank这个概念引自学术界对论文被引述频度的评价方式——即被别人引述的次数越多，一般判断这篇论文的权威性就越高。

通常来讲，原创内容越多的站点，PageRank越容易提升，反之则相对比较困难，PageRank最大上限值为10。在Google的评估中，能上10的网站真可谓凤毛麟角，即使算上Google，能成就PageRank 10这“伟业”者，放眼全球也不足40家。一般来说，个人站点评估值达到4即算办得不错，商业网站到6以上便算步入正轨了。

网上虽然有不少现成的查询器及源码，但是光用别人的毕竟不符合程序员风格，所以今天自己用Java重造轮子又写了个PageRank查询实现，捎带着把一些常用搜索引擎的网站链接及反向链接查询也加上了。

源码如下：

GooglePageRank.java

package org.loon.test;
import java.io.IOException;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Copyright 2008
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 * 
 * @project loonframework
 * @author chenpeng
 * @email：ceponline@yahoo.com.cn
 * @version 0.1
 */
publicclass GooglePageRank { 
    // google pagerank服务器ip地址列表（最近google小气了很多，反复查询一个封ip）
    final static String[] GoogleServiceIP = new String[] { "64.233.161.100",
            "64.233.161.101", "64.233.183.91", "64.233.189.44", "66.102.1.103",
            "66.102.9.115", "66.249.89.83", "66.249.91.99", "66.249.93.190" };
    // google用识别标记
    final static private int GOOGLE_MAGIC = 0xE6359A60;
    // ch数值混合器
privateclass CHMix { 
        int a;
        int b;
        int c;
public CHMix() { 
            this(0, 0, 0);
        }
public CHMix(int a, int b, int c) { 
            this.a = a;
            this.b = b;
            this.c = c;
        }
    }
    /**
     * 按google要求混合成ch数据
     * 
     * @param mix
     */
privatestaticvoid mix(final CHMix mix) { 
        mix.a -= mix.b;
        mix.a -= mix.c;
        mix.a ^= mix.c >> 13;
        mix.b -= mix.c;
        mix.b -= mix.a;
        mix.b ^= mix.a << 8;
        mix.c -= mix.a;
        mix.c -= mix.b;
        mix.c ^= mix.b >> 13;
        mix.a -= mix.b;
        mix.a -= mix.c;
        mix.a ^= mix.c >> 12;
        mix.b -= mix.c;
        mix.b -= mix.a;
        mix.b ^= mix.a << 16;
        mix.c -= mix.a;
        mix.c -= mix.b;
        mix.c ^= mix.b >> 5;
        mix.a -= mix.b;
        mix.a -= mix.c;
        mix.a ^= mix.c >> 3;
        mix.b -= mix.c;
        mix.b -= mix.a;
        mix.b ^= mix.a << 10;
        mix.c -= mix.a;
        mix.c -= mix.b;
        mix.c ^= mix.b >> 15;
    }
    /**
     * 获得ch数值混合器
     * 
     * @return
     */
publicstatic CHMix getInnerCHMix() { 
        return new GooglePageRank().new CHMix();
    }
    /**
     * 通过url获得googlech(google数据库针对页面的全球唯一标识)
     * 
     * @param url
     * @return
     */
publicstatic String GoogleCH(final String url) { 
        // 格式化为google要求的info:url模式
        String nUrl = String.format("info:%s", new Object[] { url });
        // 获得新url字符串格式
        char[] urls = nUrl.toCharArray();
        // 获得新url长度
        int length = urls.length;
        // 获得一个ch数值混合器
        CHMix chMix = GooglePageRank.getInnerCHMix();
        // 为c注入google识别标识
        chMix.c = GOOGLE_MAGIC;
        // 为a、b项注入google要求的初始标识
        chMix.a = chMix.b = 0x9E3779B9;
        int k = 0;
        int len = length;
while (len >= 12) { 
            chMix.a += (int) (urls[k + 0] + (urls[k + 1] << 8)
                    + (urls[k + 2] << 16) + (urls[k + 3] << 24));
            chMix.b += (int) (urls[k + 4] + (urls[k + 5] << 8)
                    + (urls[k + 6] << 16) + (urls[k + 7] << 24));
            chMix.c += (int) (urls[k + 8] + (urls[k + 9] << 8)
                    + (urls[k + 10] << 16) + (urls[k + 11] << 24));
            // 获得混合运算后的数据
            GooglePageRank.mix(chMix);
            k += 12;
            len -= 12;
        }
        chMix.c += length;
        // 产生googlech的11位标识
switch (len) { 
        case 11:
            chMix.c += (int) (urls[k + 10] << 24);
        case 10:
            chMix.c += (int) (urls[k + 9] << 16);
        case 9:
            chMix.c += (int) (urls[k + 8] << 8);
        case 8:
            chMix.b += (int) (urls[k + 7] << 24);
        case 7:
            chMix.b += (int) (urls[k + 6] << 16);
        case 6:
            chMix.b += (int) (urls[k + 5] << 8);
        case 5:
            chMix.b += (int) (urls[k + 4]);
        case 4:
            chMix.a += (int) (urls[k + 3] << 24);
        case 3:
            chMix.a += (int) (urls[k + 2] << 16);
        case 2:
            chMix.a += (int) (urls[k + 1] << 8);
        case 1:
            chMix.a += (int) (urls[k + 0]);
            break;
        default:
            break;
        }
        // 获得混合运算后的数据
        GooglePageRank.mix(chMix);
        // 获得未修订的CH
        String tch = String.valueOf(chMix.c);
        // 矫正差值后反馈正确CH
        return String
                .format("6%s", new Object[] { tch.length() < 10 ? ("-" + tch)
                        .intern() : tch });
    }
    /**
     * 正则匹配pagerank结果
     * 
     * @param value
     * @return
     */
privatestatic String MatchRank(final String value) { 
        Pattern pattern = Pattern.compile("Rank_1:[0-9]:([0-9]+)");
        Matcher matcher = pattern.matcher(value);
if (matcher.find()) { 
            return matcher.group(1);
        }
        return "0";
    }
    /**
     * 获得指定页面的google pagerank值
     * 
     * @param url
     * @return
     */
publicstatic String GooglePR(final String url) { 
        String rip = GoogleServiceIP[new Random()
                .nextInt(GoogleServiceIP.length)];
        return GooglePR(url, rip);
    }
    /**
     * 以指定的google服务器获得指定页面的google pagerank值
     * 
     * @param url
     * @param ip
     * @return
     */
publicstatic String GooglePR(final String url, final String ip) { 
        // 产生查询用唯一标识
        String checksum = GoogleCH(url);
        // 产生查询用url
        String queryUrl = String
                .format(
                        "http://%s/search?client=navclient-auto&ch=%s&features=Rank&q=info:%s",
                        new Object[] { ip, checksum, url });
        String response;
try { 
            response = SimpleWebClient.getRequestHttp(queryUrl);
        } catch (IOException e) { 
            response = "";
        }
if (response.length() == 0) { 
            return "0";
        } else { 
            return GooglePageRank.MatchRank(response);
        }
    }
}

SimpleWebClient.java

package org.loon.test;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import sun.misc.BASE64Encoder;
/**
 * Copyright 2008
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 * 
 * @project loonframework
 * @author chenpeng
 * @email：ceponline@yahoo.com.cn
 * @version 0.1
 */
/**
 * Minimal blocking HTTP client built on {@link HttpURLConnection}.
 * <p>
 * Basic-auth credentials are encoded with {@code java.util.Base64} (Java 8+),
 * referenced by its fully qualified name. This class therefore no longer uses
 * {@code sun.misc.BASE64Encoder}; that import can be dropped from the file.
 */
public class SimpleWebClient {

    /** Maximum number of redirects followed for one request. */
    private static final int MAX_REDIRECTS = 10;

    /** Size of the buffers used to drain a response body. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Requests {@code urlString} and decodes the response body as UTF-8.
     *
     * @param urlString target address; {@code http://} is assumed when no
     *                  scheme is given
     * @return the response body
     * @throws IOException if the URL is malformed or the exchange fails
     */
    public static String getRequestHttp(String urlString) throws IOException {
        return getRequestHttp(urlString, "utf-8");
    }

    /**
     * Requests {@code urlString} with a 5 second timeout and no extra
     * parameters.
     *
     * @param urlString target address; {@code http://} is assumed when no
     *                  scheme is given
     * @param encoding  charset name used to decode the body, or {@code null}
     *                  for the platform default charset
     * @return the response body
     * @throws IOException if the URL is malformed or the exchange fails
     */
    public static String getRequestHttp(String urlString, String encoding)
            throws IOException {
        return getRequestHttp(urlString, encoding, null, 5000);
    }

    /**
     * Requests {@code urlString} and returns the response body as text.
     * <p>
     * Entries of {@code parameter} with the keys {@code "user"},
     * {@code "pass"}, {@code "method"} and {@code "post"} select the
     * Basic-auth user name, password, HTTP method and request body; every
     * other entry is sent as a request header. Keys and values must be
     * strings. Redirects (301, 302, 303, 307, 308) are followed manually, up
     * to {@value #MAX_REDIRECTS} times, repeating the same method and body.
     * For status codes of 400 and above the error body is returned instead of
     * throwing.
     *
     * @param urlString target address; {@code http://} is assumed when no
     *                  scheme is given
     * @param encoding  charset name used to decode the body, or {@code null}
     *                  for the platform default charset
     * @param parameter optional control entries and headers, may be
     *                  {@code null}
     * @param timeout   connect and read timeout in milliseconds; values of 0
     *                  or less leave the connection defaults untouched
     * @return the response body, or an empty string when the server sent none
     * @throws IOException if the URL is malformed, a redirect is unusable,
     *                     too many redirects occur, or the exchange fails
     */
    public static String getRequestHttp(final String urlString,
            final String encoding, final Map<?, ?> parameter, final int timeout)
            throws IOException {
        if (urlString == null) {
            throw new NullPointerException("urlString must not be null");
        }
        final boolean hasScheme = urlString.startsWith("http://")
                || urlString.startsWith("https://");
        // The separator must be "://"; "http:" + host yields a URL with an
        // empty host and the intended host as its path.
        final String target = hasScheme ? urlString : "http://" + urlString;

        String user = null;
        String password = null;
        String method = "GET";
        String post = null;
        final Map<String, String> headers = new HashMap<String, String>();
        if (parameter != null) {
            for (Entry<?, ?> entry : parameter.entrySet()) {
                final String key = (String) entry.getKey();
                final String value = (String) entry.getValue();
                if ("user".equals(key)) {
                    user = value;
                } else if ("pass".equals(key)) {
                    password = value;
                } else if ("method".equals(key)) {
                    method = value;
                } else if ("post".equals(key)) {
                    post = value;
                } else {
                    headers.put(key, value);
                }
            }
        }

        String authorization = null;
        if (user != null && password != null) {
            final byte[] credentials = (user + ":" + password)
                    .getBytes("UTF-8");
            // encodeToString never inserts line breaks, so the header stays
            // on one line however long the credentials are.
            authorization = "Basic "
                    + java.util.Base64.getEncoder().encodeToString(credentials);
        }

        URL url = new URL(target);
        for (int hops = 0; hops <= MAX_REDIRECTS; hops++) {
            final String protocol = url.getProtocol();
            if (!"http".equals(protocol) && !"https".equals(protocol)) {
                throw new IOException("Unsupported protocol in " + url);
            }
            final HttpURLConnection connection = (HttpURLConnection) url
                    .openConnection();
            if (authorization != null) {
                connection.setRequestProperty("Authorization", authorization);
            }
            // Only announce a request body when there is one to send.
            connection.setDoOutput(post != null);
            connection.setDoInput(true);
            connection.setUseCaches(false);
            connection.setInstanceFollowRedirects(false);
            connection.setRequestMethod(method);
            if (timeout > 0) {
                connection.setConnectTimeout(timeout);
                // Without a read timeout a silent server blocks forever.
                connection.setReadTimeout(timeout);
            }
            // Browser-like default headers; caller headers may override them.
            connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0;)");
            connection.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, */*");
            for (Entry<String, String> header : headers.entrySet()) {
                connection.setRequestProperty(header.getKey(),
                        header.getValue());
            }
            if (post != null) {
                final OutputStreamWriter writer = new OutputStreamWriter(
                        connection.getOutputStream());
                try {
                    writer.write(post);
                    writer.flush();
                } finally {
                    writer.close();
                }
            }

            final int status = connection.getResponseCode();
            if (isRedirect(status)) {
                final String location = connection.getHeaderField("Location");
                if (location == null) {
                    throw new IOException("HTTP " + status + " from " + url
                            + " without a Location header");
                }
                // Resolve against the current URL so relative targets work.
                url = new URL(url, location);
                continue;
            }

            final InputStream body = status < 400 ? connection
                    .getInputStream() : connection.getErrorStream();
            if (body == null) {
                // getErrorStream() returns null when no error body was sent.
                return "";
            }
            return read(body, encoding);
        }
        throw new IOException("More than " + MAX_REDIRECTS
                + " redirects while requesting " + target);
    }

    /**
     * Tells whether {@code status} is a redirect this client follows.
     *
     * @param status HTTP status code
     * @return {@code true} for 301, 302, 303, 307 and 308
     */
    private static boolean isRedirect(final int status) {
        return status == 301 || status == 302 || status == 303
                || status == 307 || status == 308;
    }

    /**
     * Drains {@code in} into a string and closes it, even on failure.
     *
     * @param in       stream to read to its end
     * @param encoding charset name, or {@code null} for the platform default
     * @return the decoded content
     * @throws IOException if reading fails or the charset name is unsupported
     */
    private static String read(final InputStream in, final String encoding)
            throws IOException {
        try {
            if (encoding == null) {
                final ByteArrayOutputStream out = new ByteArrayOutputStream();
                final byte[] bytes = new byte[BUFFER_SIZE];
                int got;
                while ((got = in.read(bytes)) >= 0) {
                    out.write(bytes, 0, got);
                }
                return new String(out.toByteArray());
            }
            final InputStreamReader reader = new InputStreamReader(in,
                    encoding);
            final StringBuilder text = new StringBuilder();
            final char[] chars = new char[BUFFER_SIZE];
            int got;
            while ((got = reader.read(chars)) != -1) {
                text.append(chars, 0, got);
            }
            return text.toString();
        } finally {
            in.close();
        }
    }
}

WebAppraise.java

package org.loon.test;
import java.io.IOException;
/**
 * Copyright 2008
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 * 
 * @project loonframework
 * @author chenpeng
 * @email：ceponline@yahoo.com.cn
 * @version 0.1
 */
publicclass WebAppraise { 
    private String googleSum;
    private String baiduSum;
    private String msnSum;
    private String altaVistaSum;
    private String allTheWebSum;
    private String yahooSum;
    private String testURL;
public WebAppraise(final String url) { 
if (url != null && !"".equals(url)) { 
            this.testURL = url.trim();
if (this.testURL.startsWith("http://")) { 
                this.testURL = this.testURL.substring(7);
            }
if (this.testURL.startsWith("https://")) { 
                this.testURL = this.testURL.substring(8);
            }
        } else { 
            throw new RuntimeException("url is NULL!");
        }
    }
    /**
     * 分析指定链接结果，并返回整型数值
     * 
     * @param searchURL
     * @param anchor
     * @param trail
     * @return
     */
    private static int getLinks(final String searchURL, final String anchor,
final String trail) { 
        int count = 0;
        String serverResponse;
try { 
            // 我国特色……
if (searchURL.startsWith("http://www.baidu.com")) { 
                // 永不离休的gb2312同志(-_-||)
                serverResponse = SimpleWebClient.getRequestHttp(searchURL,
                        "gb2312");
            } else { 
                serverResponse = SimpleWebClient.getRequestHttp(searchURL);
            }
        } catch (IOException e) { 
            serverResponse = e.getMessage();
        }
        int pos = serverResponse.indexOf(anchor);
if (pos > 1) { 
            serverResponse = serverResponse.substring(pos + anchor.length());
            pos = serverResponse.indexOf(trail);
            String value = serverResponse.substring(0, pos).trim();
            value = value.replace(",", "");
            value = value.replace(".", "");
            count = Integer.parseInt(value);
        }
        return count;
    }
public String getAllTheWebSite() { 
        return getAllTheWebSite(false);
    }
public String getAllTheWebSite(boolean isDomain) { 
try { 
            String allTheWeb;
if (isDomain) { 
                allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf8&rys=0&itag=crv&_sb_lang=any&q=linkdomain%3A"
                        + this.testURL;
            } else { 
                allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf-8&q=link%3Ahttp%3A%2F%2F"
                        + this.testURL + "&_sb_lang=any";
            }
            allTheWebSum = ""
                    + getLinks(allTheWeb, "<span class=/"ofSoMany/">",
                            "</span>");
        } catch (Exception ex) { 
            allTheWebSum = ex.getMessage();
        }
        return allTheWebSum;
    }
public String getAltaVistaSite() { 
        return getAltaVistaSite(false);
    }
public String getAltaVistaSite(boolean isDomain) { 
try { 
            String altaVista;
if (isDomain) { 
                altaVista = "http://www.altavista.com/web/results?itag=ody&q=link%3A"
                        + this.testURL + "&kgs=0&kls=0";
            } else { 
                altaVista = "http://www.altavista.com/web/results?itag=ody&kgs=0&kls=0&q=site%3A"
                        + this.testURL;
            }
            altaVistaSum = "" + getLinks(altaVista, "AltaVista found ", " ");
        } catch (Exception ex) { 
            altaVistaSum = ex.getMessage();
        }
        return altaVistaSum;
    }
public String getGooglePR() { 
        return GooglePageRank.GooglePR(this.testURL);
    }
public String getGoogleSite() { 
        return getGoogleSite(false);
    }
public String getGoogleSite(finalboolean isDomian) { 
try { 
            String google;
            // 反向链接
if (isDomian) { 
                google = "http://www.google.com/search?hl=en&q=link%3A"
                        + this.testURL;
            } else { 
                google = "http://www.google.com/search?hl=en&q=site%3A"
                        + this.testURL + "&btnG=Google+Search&aq=f&oq=";
            }
            googleSum = "" + getLinks(google, "about <b>", "</b>");
        } catch (Exception ex) { 
            googleSum = ex.getMessage();
        }
        return googleSum;
    }
public String getBaiduSite() { 
        return getBaiduSite(false);
    }
public String getBaiduSite(finalboolean isDomian) { 
try { 
            String baidu;
if (isDomian) { 
                baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
                        + "&cl=3";
            } else { 
                baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
            }
            baiduSum = "" + getLinks(baidu, "找到相关网页", "篇");
        } catch (Exception ex) { 
            String baidu;
if (isDomian) { 
                baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
                        + "&cl=3";
            } else { 
                baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
            }
            baiduSum = "" + getLinks(baidu, "找到相关网页约", "篇");
        }
        return baiduSum;
    }
public String getYahooSite() { 
        return getYahooSite(false);
    }
public String getYahooSite(finalboolean isDomian) { 
try { 
            String yahoo;
if (isDomian) { 
                yahoo = "http://sitemap.cn.yahoo.com/search?p=" + this.testURL
                        + "&bwm=i";
                yahooSum = "" + getLinks(yahoo, "<strong>", "</strong>");
            } else { 
                yahoo = "http://www.yahoo.cn/s?p=site%3A" + this.testURL
                        + "&pid=hp&v=web";
                yahooSum = "" + getLinks(yahoo, "找到相关网页约", "条");
            }
        } catch (Exception ex) { 
            yahooSum = ex.getMessage();
        }
        return yahooSum;
    }
public String getMsnSite() { 
        return getMsnSite(false);
    }
public String getMsnSite(boolean isDomain) { 
try { 
            String msn;
if (isDomain) { 
                msn = "http://cnweb.search.live.com/results.aspx?q=link%3A"
                        + this.testURL + "&mkt=zh-cn&scope=&FORM=LIVSO";
            } else { 
                msn = "http://cnweb.search.live.com/results.aspx?q=site%3A"
                        + this.testURL + "&go=&form=QBRE";
            }
            msnSum = "" + getLinks(msn, "共", "条搜索结果");
        } catch (Exception ex) { 
            msnSum = ex.getMessage();
        }
        return msnSum;
    }再分享一下我老师大神的人工智能教程吧。零基础！通俗易懂！风趣幽默！还带黄段子！希望你也加入到我们人工智能的队伍中来！ 

转载地址：http://teeub.baihongyu.com/

你可能感兴趣的文章