// QueryTask: a Task subclass whose execute() handles one GlobalQuery work item
// (host name + path + depth) and whose done() records follow-up queries.
// NOTE(review): this listing is sparse - the embedded source numbers skip, so some
// field declarations and closing braces are not shown here.
1 public class QueryTask extends Task {
// Map supplied through the constructor; processList() fetches and stores a Queue
// under each title token in it.
4 DistributedHashMap results;
// Set in execute() from a GlobalStringBuffer that starts with the query's host name
// (presumably the path is appended in lines not shown - confirm).
6 GlobalString workingURL;
// Constructor: keeps references to the to-do queue, the done map, the depth limit
// and the results map, and allocates a new Queue (via `global new`) for toprocess.
// NOTE(review): todoList, doneList, maxDepth and toprocess are not declared in the
// visible lines - presumably inherited from Task or declared in lines not shown.
8 public QueryTask(Queue todoList, DistributedHashMap doneList, int maxDepth, DistributedHashMap results) {
9 this.todoList = todoList;
10 this.doneList = doneList;
11 this.maxDepth = maxDepth;
12 this.results = results;
// Queue that execute() later replaces with the result of processPage().
13 toprocess = global new Queue();
// Processes the current work item (myWork, cast to GlobalQuery).
// Visible steps, in order: read the depth, host name and path from the query; copy
// host and path into local Strings; build workingURL; create a LocalQuery; print a
// progress line; test isDocument(path); open a Socket to port 80 on the host; send
// the request; try to extract a page title; collect follow-up work via processPage().
// NOTE(review): the bodies of the if-branches, the local declarations and any
// readResponse() call fall in lines not shown, so the outcome of each branch cannot
// be stated from this listing.
16 public void execute() {
21 depth = ((GlobalQuery)myWork).getDepth();
26 /* global variables */
36 gq = (GlobalQuery)myWork;
// Copy the query's host name and path into local Strings.
37 hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
38 path = new String(GlobalString.toLocalCharArray(gq.getPath()));
// The URL buffer starts from the host name.
40 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
43 workingURL = global new GlobalString(gsb.toGlobalString());
// Local working copy of the query, used by grabTitle() and processPage() below.
46 lq = new LocalQuery(hostname, path, depth);
// Progress output of the form: [depth] Processing - Hostname : <host>, Path : <path>
48 System.printString("["+lq.getDepth()+"] ");
49 System.printString("Processing - Hostname : ");
50 System.printString(hostname);
51 System.printString(", Path : ");
52 System.printString(path);
53 System.printString("\n");
// Paths with a known binary/document extension get special handling (branch body not shown).
55 if (isDocument(path)) {
59 Socket s = new Socket();
// A connect() result of -1 is singled out here (presumably a connection failure - confirm).
61 if(s.connect(hostname, 80) == -1) {
65 requestQuery(hostname, path, s);
// A non-null title is wrapped in a GlobalString and stored in gTitle.
68 if ((title = grabTitle(lq)) != null) {
70 gTitle = global new GlobalString(title);
// toprocess now refers to the queue built by processPage().
73 toprocess = processPage(lq);
// Returns true when the text after the last '.' in str equals one of the listed
// file extensions (pdf, ps, ppt, pptx, jpg, mp3, wmv, doc, docx, mov, flv).
// The comparison uses equals(), so only the lower-case spellings listed here match.
// NOTE(review): handling of a missing '.' and the fall-through return are in lines
// not shown. lastindexOf/subString are spelled as this code base's String class
// presumably defines them - confirm before porting to standard Java.
80 public static boolean isDocument(String str) {
81 int index = str.lastindexOf('.');
// Each branch recomputes the same suffix, str.subString(index+1).
84 if ((str.subString(index+1)).equals("pdf")) return true;
85 else if ((str.subString(index+1)).equals("ps")) return true;
86 else if ((str.subString(index+1)).equals("ppt")) return true;
87 else if ((str.subString(index+1)).equals("pptx")) return true;
88 else if ((str.subString(index+1)).equals("jpg")) return true;
89 else if ((str.subString(index+1)).equals("mp3")) return true;
90 else if ((str.subString(index+1)).equals("wmv")) return true;
91 else if ((str.subString(index+1)).equals("doc")) return true;
92 else if ((str.subString(index+1)).equals("docx")) return true;
93 else if ((str.subString(index+1)).equals("mov")) return true;
94 else if ((str.subString(index+1)).equals("flv")) return true;
// Post-processing step for this task.
// Visible behaviour: checks whether a non-empty title was captured, then drains
// toprocess; for each popped GlobalQuery it builds a key from a buffer that starts
// with the query's host name and, if doneList has no entry for that key, stores the
// marker string "1" under it.
// NOTE(review): the body of the gTitle check, any path appended to the key, and any
// hand-off to todoList are in lines not shown - confirm against the full file.
// The obj parameter is not used in the visible lines.
100 public void done(Object obj) {
101 if ((gTitle != null) && (gTitle.length() > 0)) {
105 while(!toprocess.isEmpty()) {
106 GlobalQuery q = (GlobalQuery)toprocess.pop();
// Fresh global copies of the popped query's host name and path.
108 GlobalString hostname = global new GlobalString(q.getHostName());
109 GlobalString path = global new GlobalString(q.getPath());
111 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
// Only keys not yet present in doneList are recorded.
115 if (!doneList.containsKey(gsb.toGlobalString())) {
118 GlobalString str = global new GlobalString("1");
119 doneList.put(gsb.toGlobalString(), str);
// Extracts the page title from lq.response.
// Visible algorithm: scan the response for '<'; compare the following text with
// "title>" ignoring case; from there scan for the next '<' whose following text is
// "/title>" ignoring case; take the text in between; strip leading and trailing
// whitespace; return null when nothing is left; finally test the result with
// isErrorPage().
// NOTE(review): several index adjustments (stepping past '<', resetting mindex to
// the start of title, correcting for the post-increment/decrement in the whitespace
// loops) and the final return are in lines not shown - confirm against the full file.
124 public static String grabTitle(LocalQuery lq) {
// sBrace is not referenced in the visible lines.
125 String sBrace = new String("<");
126 String strTitle = new String("title>");
127 String searchstr = lq.response.toString();
136 for (i = 0; i < searchstr.length(); i++) {
137 if (searchstr.charAt(i) == '<') {
// Only compare when enough characters remain for "title>".
139 if (searchstr.length() > (i+strTitle.length())) {
140 tmp = searchstr.subString(i, i+strTitle.length());
141 if (tmp.equalsIgnoreCase("title>")) {
// mindex marks the first character after the opening tag.
142 mindex = i + tmp.length();
143 for (j = mindex; j < searchstr.length(); j++) {
144 if (searchstr.charAt(j) == '<') {
// NOTE(review): no length check is visible before this subString - confirm one exists.
146 tmp = searchstr.subString(j, j+strTitle.length()+1);
147 if (tmp.equalsIgnoreCase("/title>")) {
// Raw title text between the opening and closing tags.
159 title = searchstr.subString(mindex, endquote);
160 if (Character.isWhitespace(title.charAt(0))){
// Skips leading whitespace; the loop itself has no bound on mindex.
162 while (Character.isWhitespace(title.charAt(mindex++)));
164 if (mindex >= title.length()) return null;
165 title = new String(title.subString(mindex));
168 if (Character.isWhitespace(title.charAt(title.length()-1))) {
169 endquote=title.length()-1;
// Skips trailing whitespace; the loop itself has no lower bound on endquote.
170 while (Character.isWhitespace(title.charAt(endquote--)));
172 if (mindex >= endquote) return null;
173 title = new String(title.subString(0, endquote));
// Titles that match a known HTTP error page are singled out (branch body not shown).
176 if (isErrorPage(title)) {
180 // System.out.println("Title = [" + title + "]");
// Tests whether str is exactly one of the listed error-page titles:
// "301 Moved Permanently", "302 Found", "404 Not Found", "403 Forbidden",
// "404 File Not Found". Matching uses equals(), so it is exact and case-sensitive.
// NOTE(review): the return statements are in lines not shown - presumably true for
// a match and false otherwise.
185 public static boolean isErrorPage(String str) {
186 if (str.equals("301 Moved Permanently"))
188 else if (str.equals("302 Found"))
190 else if (str.equals("404 Not Found"))
192 else if (str.equals("403 Forbidden"))
194 else if (str.equals("404 File Not Found"))
// Builds an HTTP/1.0 GET request with a Host header for hostname, terminated by a
// blank line, and writes its bytes to sock.
// NOTE(review): the path parameter is not used in the visible lines - presumably it
// is appended after "GET " in lines not shown.
200 public static void requestQuery(String hostname, String path, Socket sock) {
201 StringBuffer req = new StringBuffer("GET ");
204 req.append(" HTTP/1.0\r\nHost: ");
205 req.append(hostname);
// The empty line ends the request headers.
206 req.append("\r\n\r\n");
207 sock.write(req.toString().getBytes());
// Reads from sock in chunks of up to 1024 bytes and appends the first numchars
// characters of each chunk to lq.response, repeating while read() returns more
// than zero.
// NOTE(review): another readResponse with the same signature follows in this file;
// presumably one of the two is commented out in lines not shown - confirm.
210 public static void readResponse(LocalQuery lq, Socket sock) {
215 // state 4 - \r\n\r\n
216 byte[] buffer = new byte[1024];
220 numchars = sock.read(buffer);
// NOTE(review): no guard for a negative numchars is visible before this subString.
222 String curr = (new String(buffer)).subString(0, numchars);
224 lq.response.append(curr);
// A fresh buffer is allocated for every iteration.
225 buffer = new byte[1024];
226 } while(numchars > 0);
// State-driven variant of readResponse: states 0 to 3 each read a single byte from
// sock, and a later branch reads up to 1024 bytes at a time and appends the first
// numchars characters to lq.response.
// NOTE(review): the state transitions are in lines not shown; going by the
// "state 4 - \r\n\r\n" comment they presumably track the blank line that ends the
// HTTP headers - confirm. A method with the same signature precedes this one.
230 public static void readResponse(LocalQuery lq, Socket sock) {
235 // state 4 - \r\n\r\n
// One-byte read (first state; its condition is not shown).
240 byte[] b=new byte[1];
241 int numchars=sock.read(b);
248 } else if (state==1) {
249 byte[] b=new byte[1];
250 int numchars=sock.read(b);
257 } else if (state==2) {
258 byte[] b=new byte[1];
259 int numchars=sock.read(b);
266 } else if (state==3) {
267 byte[] b=new byte[1];
268 int numchars=sock.read(b);
// Bulk read: up to 1024 bytes per call.
277 byte[] buffer=new byte[1024];
278 int numchars=sock.read(buffer);
// Keep only the part of the buffer that was actually filled.
282 String curr=(new String(buffer)).subString(0,numchars);
283 // System.out.println("numchars = "+numchars);
284 lq.response.append(curr);
// Splits gTitle into space-separated tokens and registers each token in results.
// Visible steps: find the next ' ' at or after mindex; take the text before it as
// a token and move mindex past the space; when no space remains, take the rest of
// the title as the last token; pass each token through refine(); then fetch the
// Queue stored under the token in results, allocate a new Queue, and put the Queue
// back under the token.
// NOTE(review): the condition guarding the new Queue, what gets pushed onto it and
// any filter() call are in lines not shown - confirm against the full file.
290 public void processList() {
292 GlobalString token = null;
296 while (endquote != -1) {
297 endquote = gTitle.indexOf(' ', mindex);
299 if (endquote != -1) {
// Token is the text between mindex and the space; continue after the space.
300 token = gTitle.subString(mindex, endquote);
301 mindex = endquote + 1;
305 token = refine(token);
// No further space: the remainder of the title is the final token.
308 token = gTitle.subString(mindex);
309 token = refine(token);
312 Queue q = (Queue)results.get(token);
314 q = global new Queue();
317 results.put(token, q);
318 // System.out.println("Key : ["+token.toLocalString()+"],["+q.size()+"]");
// Returns true for tokens that should be ignored: a fixed list of common English
// words ("of", "for", "a", "an", "the", "at", "and", "or", "but", "to", "The") and
// single-character tokens made of one of the listed punctuation characters or a space.
// NOTE(review): the fall-through result is in lines not shown - presumably false.
322 public boolean filter(GlobalString str) {
323 if (str.equals("of")) return true;
324 else if (str.equals("for")) return true;
325 else if (str.equals("a")) return true;
326 else if (str.equals("an")) return true;
327 else if (str.equals("the")) return true;
328 else if (str.equals("at")) return true;
329 else if (str.equals("and")) return true;
330 else if (str.equals("or")) return true;
331 else if (str.equals("but")) return true;
332 else if (str.equals("to")) return true;
// "The" is the only capitalised spelling in the list.
333 else if (str.equals("The")) return true;
// Single-character tokens: reject punctuation and a lone space.
334 else if (str.length() == 1) {
335 if (str.charAt(0) == '.') return true;
// NOTE(review): this repeats the '.' test above and can never be reached with a
// match; possibly ',' was intended - confirm.
336 else if (str.charAt(0) == '.') return true;
337 else if (str.charAt(0) == '-') return true;
338 else if (str.charAt(0) == '=') return true;
339 else if (str.charAt(0) == '_') return true;
340 else if (str.charAt(0) == ':') return true;
341 else if (str.charAt(0) == ';') return true;
342 else if (str.charAt(0) == '\'') return true;
343 else if (str.charAt(0) == '\"') return true;
344 else if (str.charAt(0) == '|') return true;
345 else if (str.charAt(0) == '@') return true;
346 else if (str.charAt(0) == '&') return true;
347 else if (str.charAt(0) == ' ') return true;
// Cleans up a token by applying refinePrefix() and then refinePostfix() to it.
// NOTE(review): the return statement is in a line not shown - presumably the
// cleaned str.
352 public GlobalString refine(GlobalString str) {
353 str = refinePrefix(str);
354 str = refinePostfix(str);
// Drops one leading '&' or '/' from str and returns the remainder.
// NOTE(review): no check for an empty str is visible before charAt(0), and the
// result for other first characters is in lines not shown - presumably str unchanged.
358 public GlobalString refinePrefix(GlobalString str) {
359 if (str.charAt(0) == '&') { // &
360 return str.subString(1);
362 else if (str.charAt(0) == '/') { // /
363 return str.subString(1);
// Trims the end of a token and returns the shortened string:
//  - a trailing ',', ':', ';' or '!' is removed;
//  - a trailing "'s" is removed (both characters);
//  - a trailing '-' is removed together with any whitespace directly before it.
// NOTE(review): no check for an empty str is visible before charAt(length()-1), and
// the result for other endings is in lines not shown - presumably str unchanged.
368 public GlobalString refinePostfix(GlobalString str) {
369 if (str.charAt(str.length()-1) == ',') { // ,
370 return str.subString(0, str.length()-1);
372 else if (str.charAt(str.length()-1) == ':') { // :
373 return str.subString(0, str.length()-1);
375 else if (str.charAt(str.length()-1) == ';') { // ;
376 return str.subString(0, str.length()-1);
378 else if (str.charAt(str.length()-1) == '!') { // !
379 return str.subString(0, str.length()-1);
381 else if (str.charAt(str.length()-1) == 's') { // 's
// NOTE(review): for a one-character "s" this reads index -1; no guard is visible.
382 if (str.charAt(str.length()-2) == '\'')
383 return str.subString(0, str.length()-2);
385 else if (str.charAt(str.length()-1) == '-') {
// Walk left from the character before '-' over whitespace. The post-decrement
// leaves index one below the last kept character, hence index+2 as the end.
// NOTE(review): for a lone "-" this starts at index -1; no guard is visible.
386 int index = str.length()-2;
387 while (Character.isWhitespace(str.charAt(index--)));
388 return str.subString(0, index+2);
393 public static Queue processPage(LocalQuery lq) {
395 String href = new String("href=\"");
396 String searchstr = lq.response.toString();
401 depth = lq.getDepth() + 1;
403 toprocess = global new Queue();
405 int mindex = searchstr.indexOf(href,index);
407 int endquote = searchstr.indexOf('"', mindex+href.length());
408 if (endquote != -1) {
409 String match = searchstr.subString(mindex+href.length(), endquote);
410 String match2 = lq.makewebcanonical(match);
412 GlobalString ghostname;
415 ghostname = global new GlobalString(lq.getHostName(match));
416 gpath = global new GlobalString(lq.getPathName(match));
418 if (match2 != null) {
419 GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);