-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebScrap.java
More file actions
70 lines (50 loc) · 1.77 KB
/
WebScrap.java
File metadata and controls
70 lines (50 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package webscrap;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line scraper: reads page URLs (one per line) from a text file, downloads each
 * page, and prints the link and title of every {@code class="track"} anchor it finds.
 *
 * <p>Usage: {@code java webscrap.WebScrap [urlListFile]}. When no argument is given the
 * list is read from {@code E:/new.txt}.
 *
 * @author admin
 */
public class WebScrap {

    /** URL list that is read when no path is passed on the command line. */
    private static final String DEFAULT_URL_LIST = "E:/new.txt";

    /** Connect/read timeout so that one stalled server cannot hang the whole run. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Matches one track entry. Group 1 is the anchor's {@code href}; group 5 is the text
     * of the {@code <h5>} that follows it. Compiled once because it never changes.
     */
    private static final Pattern TRACK_PATTERN = Pattern.compile(
            "<a href=\"(.*?)\" class=\"track\"(.*?)\">\\n\\s(.*?)</span>\\n(.*?)<h5>(.*?)</h5>");

    /**
     * Scrapes every URL listed in the input file and prints the matches to standard output.
     * Failures are reported on standard error; a failure on one URL does not stop the
     * remaining URLs from being processed.
     *
     * @param args optional; {@code args[0]} is the path of the URL list file
     */
    public static void main(String[] args) {
        String pathfile = (args != null && args.length > 0) ? args[0] : DEFAULT_URL_LIST;

        // try-with-resources closes the list file on every exit path.
        try (BufferedReader reader = new BufferedReader(new FileReader(pathfile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String address = line.trim();
                if (address.isEmpty()) {
                    continue; // blank lines are not URLs
                }
                System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>" + line);
                try {
                    printTracks(download(address));
                } catch (IOException ioe) {
                    // Malformed URL, unknown host, timeout, ...: report and move on.
                    System.err.println("Skipping " + address + ": " + ioe);
                }
            }
        } catch (IOException ioe) {
            System.err.println("Cannot read URL list " + pathfile + ": " + ioe);
        }
    }

    /**
     * Downloads the resource at {@code address} and returns it as text, with every line
     * terminated by {@code '\n'} (the track pattern relies on that).
     *
     * @param address absolute URL to fetch
     * @return the full body of the response
     * @throws IOException if the URL is malformed or the resource cannot be read
     */
    private static String download(String address) throws IOException {
        URLConnection conn = new URL(address).openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder content = new StringBuilder();
        // NOTE(review): the body is decoded with the platform default charset, as before;
        // consider honouring the charset declared by the response.
        try (BufferedReader page =
                new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
            String row;
            while ((row = page.readLine()) != null) {
                content.append(row).append('\n');
            }
        }
        return content.toString();
    }

    /**
     * Prints the link and title of every track entry found in {@code html}.
     *
     * @param html page text as produced by {@link #download(String)}
     */
    private static void printTracks(String html) {
        Matcher matcher = TRACK_PATTERN.matcher(html);
        while (matcher.find()) {
            System.out.println(" LINK ==> " + matcher.group(1));
            System.out.println("TITLE :==> " + matcher.group(5));
        }
    }
}