// java爬虫 (Java web crawler)

import java.net.*;
import java.io.*;
import java.util.regex.*;
import java.util.*;
//import java.util.Queue;
//import java.util.LinkedList;

/**
 * Minimal breadth-first web crawler that prints every e-mail address it finds.
 *
 * <p>Starting from {@link #SEED_URL}, each page is read line by line. Every
 * e-mail address on a line is printed to standard output, and every
 * {@code http...} link that ends in a double quote is queued for a later
 * visit. Each distinct link is queued at most once.
 *
 * <p>The crawl has no depth or page limit; it runs until no unvisited links
 * remain.
 */
public class Spider {
    /** Page the crawl starts from. */
    private static final String SEED_URL = "http://tieba.baidu.com/p/3856188322";

    /**
     * E-mail matcher: word characters, {@code @}, then a dotted domain.
     * The domain part is a repeated group {@code (\.\w+)+}; a character class
     * here would also accept runs of bare dots and plus signs.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /** Link matcher: shortest run starting with {@code http} and ending at a double quote. */
    private static final Pattern URL_PATTERN = Pattern.compile("http.*?\"");

    /**
     * Program entry point; starts the crawl.
     *
     * @param args ignored
     * @throws Exception declared for compatibility with {@link #getMail()}
     */
    public static void main(String args[]) throws Exception {
        getMail();
    }

    /**
     * Crawls outward from {@link #SEED_URL}, printing each e-mail address found
     * to standard output.
     *
     * <p>Pages that cannot be opened or read are skipped and the crawl continues
     * with the next queued link.
     *
     * @throws Exception kept in the signature for existing callers
     */
    public static void getMail() throws Exception {
        Queue<String> pending = new ArrayDeque<>();
        // Keyed by the URL text rather than java.net.URL, whose equals/hashCode
        // may resolve host names over the network.
        Set<String> seen = new HashSet<>();
        seen.add(SEED_URL);
        pending.add(SEED_URL);

        String current;
        while ((current = pending.poll()) != null) {
            // One connection per page; the reader is closed even on failure.
            try (BufferedReader reader = openPage(current)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    for (String mail : extractMails(line)) {
                        System.out.println(mail);
                    }
                    for (String link : extractUrls(line)) {
                        // Set.add returns false for links already queued or visited.
                        if (seen.add(link)) {
                            pending.add(link);
                        }
                    }
                }
            } catch (IOException | RuntimeException ignored) {
                // Best-effort crawl: an unreachable, unreadable or unusable
                // address (e.g. a URL with no host) must not stop the run.
                continue;
            }
        }
    }

    /**
     * Opens a reader over the body of the given page.
     * The stream is decoded with the platform default charset.
     */
    private static BufferedReader openPage(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        return new BufferedReader(new InputStreamReader(connection.getInputStream()));
    }

    /**
     * Returns every e-mail address found in {@code line}, in order of appearance.
     *
     * @param line text to scan
     * @return the matched addresses; empty if there are none
     */
    static List<String> extractMails(String line) {
        List<String> mails = new ArrayList<>();
        Matcher matcher = MAIL_PATTERN.matcher(line);
        while (matcher.find()) {
            mails.add(matcher.group());
        }
        return mails;
    }

    /**
     * Returns every syntactically valid link found in {@code line}, in order of
     * appearance, with the terminating double quote removed.
     *
     * @param line text to scan
     * @return the links that {@link URL} can parse; empty if there are none
     */
    static List<String> extractUrls(String line) {
        List<String> links = new ArrayList<>();
        Matcher matcher = URL_PATTERN.matcher(line);
        while (matcher.find()) {
            String candidate = matcher.group().replace("\"", "");
            try {
                // Constructed only to validate the syntax; no connection is made.
                new URL(candidate);
            } catch (MalformedURLException ignored) {
                // Text that merely starts with "http" but is not a URL.
                continue;
            }
            links.add(candidate);
        }
        return links;
    }
}