d8b9a12147bde8f266080e16a511776694b7db64
[IRC.git] / Robust / src / Benchmarks / Spider / recovery / QueryTask.java
1 public class QueryTask extends Task {
2         int maxDepth;
3         Queue toprocess;
4         DistributedHashMap results;
5         GlobalString gTitle;
6         GlobalString workingURL;
7
8   public QueryTask(Queue todoList, DistributedHashMap doneList, int maxDepth, DistributedHashMap results) {
9     this.todoList = todoList;
10                 this.doneList = doneList;
11                 this.maxDepth = maxDepth;
12                 this.results = results;
13                 toprocess = global new Queue();
14   }
15
16   public void execute() {
17                 int depth;
18                 int max;
19                 
20                 atomic {
21                         depth = ((GlobalQuery)myWork).getDepth();
22       max = this.maxDepth;
23                 }
24
25                 if (depth < max) {
26                         /* global variables */
27                         GlobalQuery gq;
28
29                         /* local variables */
30                         LocalQuery lq;
31                         String hostname;
32                         String path;
33                         String title;
34
35                         atomic {
36                                 gq = (GlobalQuery)myWork;
37                                 hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
38                                 path = new String(GlobalString.toLocalCharArray(gq.getPath()));
39
40                                 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
41                                 gsb.append("/");
42                                 gsb.append(path);
43                                 workingURL = global new GlobalString(gsb.toGlobalString());
44                                 gTitle = null;
45                         }
46                         lq = new LocalQuery(hostname, path, depth);
47
48                         System.printString("["+lq.getDepth()+"] ");
49                         System.printString("Processing - Hostname : ");
50                         System.printString(hostname);
51                         System.printString(", Path : ");
52                         System.printString(path);
53                         System.printString("\n");
54
55                         if (isDocument(path)) {
56                                 return;
57                         }
58
59                         Socket s = new Socket();
60
61                         if(s.connect(hostname, 80) == -1) {
62                                 return;
63                         }
64
65                         requestQuery(hostname, path, s);
66                         readResponse(lq, s);
67
68                         if ((title = grabTitle(lq)) != null) {
69                                 atomic {
70                                         gTitle = global new GlobalString(title);
71                                 }
72                                 atomic {
73                                         toprocess = processPage(lq);
74                                 }
75                         }
76                         s.close();
77                 }
78   }
79         
80         public static boolean isDocument(String str) {
81                 int index = str.lastindexOf('.');
82
83                 if (index != -1) {
84                         if ((str.subString(index+1)).equals("pdf")) return true;
85                         else if ((str.subString(index+1)).equals("ps")) return true;
86                         else if ((str.subString(index+1)).equals("ppt")) return true;
87                         else if ((str.subString(index+1)).equals("pptx")) return true;
88                         else if ((str.subString(index+1)).equals("jpg")) return true;
89                         else if ((str.subString(index+1)).equals("mp3")) return true;
90                         else if ((str.subString(index+1)).equals("wmv")) return true;
91                         else if ((str.subString(index+1)).equals("doc")) return true;
92                         else if ((str.subString(index+1)).equals("docx")) return true;
93                         else if ((str.subString(index+1)).equals("mov")) return true;
94                         else if ((str.subString(index+1)).equals("flv")) return true;
95                         else return false;
96                 }
97                 return false;
98         }
99
100         public void done(Object obj) {
101                 if ((gTitle != null) && (gTitle.length() > 0)) {
102                         processList();
103                 }
104
105                 while(!toprocess.isEmpty()) {
106                         GlobalQuery q = (GlobalQuery)toprocess.pop();
107
108                         GlobalString hostname = global new GlobalString(q.getHostName());
109                         GlobalString path = global new GlobalString(q.getPath());
110
111                         GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
112                         gsb.append("/");
113                         gsb.append(path);
114
115                         if (!doneList.containsKey(gsb.toGlobalString())) {
116                                 todoList.push(q);
117                                         
118                                 GlobalString str = global new GlobalString("1");
119                                 doneList.put(gsb.toGlobalString(), str);
120                         }
121                 }
122         }
123
124         public static String grabTitle(LocalQuery lq) {
125                 String sBrace = new String("<");        
126                 String strTitle = new String("title>");
127         String searchstr = lq.response.toString();
128                 String title = null;
129                 char ch;
130
131                 int mindex = -1;
132                 int endquote = -1;
133                 int i, j;
134                 String tmp;
135
136                 for (i = 0; i < searchstr.length(); i++) {
137                         if (searchstr.charAt(i) == '<') {
138                                 i++;
139                                 if (searchstr.length() > (i+strTitle.length())) {
140                                         tmp = searchstr.subString(i, i+strTitle.length());
141                                         if (tmp.equalsIgnoreCase("title>")) {
142                                                 mindex = i + tmp.length();
143                                                 for (j = mindex; j < searchstr.length(); j++) {
144                                                         if (searchstr.charAt(j) == '<') {
145                                                                 j++;
146                                                                 tmp = searchstr.subString(j, j+strTitle.length()+1);                    
147                                                                 if (tmp.equalsIgnoreCase("/title>")) {
148                                                                         endquote = j - 1;
149                                                                         break;
150                                                                 }
151                                                         }
152                                                 }
153                                         }
154                                 }
155                         }
156                 }
157
158                 if (mindex != -1) {
159                         title = searchstr.subString(mindex, endquote);
160                         if (Character.isWhitespace(title.charAt(0))){
161                                 mindex=0;
162                                 while (Character.isWhitespace(title.charAt(mindex++)));
163                                 mindex--;
164                                 if (mindex >= title.length()) return null;
165                                 title = new String(title.subString(mindex));
166                         }
167
168                         if (Character.isWhitespace(title.charAt(title.length()-1))) {
169                                 endquote=title.length()-1;
170                                 while (Character.isWhitespace(title.charAt(endquote--)));
171                                 endquote += 2;
172                                 if (mindex >= endquote) return null;
173                                 title = new String(title.subString(0, endquote));
174                         }
175
176                         if (isErrorPage(title)) {
177                                 return null;
178                         }
179                 }
180 //              System.out.println("Title = [" + title + "]");
181
182                 return title;
183         }
184
185         public static boolean isErrorPage(String str) { 
186                 if (str.equals("301 Moved Permanently")) 
187                         return true;
188                 else if (str.equals("302 Found")) 
189                         return true;
190                 else if (str.equals("404 Not Found")) 
191                         return true;
192                 else if (str.equals("403 Forbidden")) 
193                         return true;
194                 else if (str.equals("404 File Not Found")) 
195                         return true;
196                 else
197                         return false;
198         }
199
200         public static void requestQuery(String hostname, String path, Socket sock) {
201     StringBuffer req = new StringBuffer("GET "); 
202     req.append("/");
203                 req.append(path);
204           req.append(" HTTP/1.0\r\nHost: ");
205     req.append(hostname);
206     req.append("\r\n\r\n");
207     sock.write(req.toString().getBytes());
208   }
209
210         public static void readResponse(LocalQuery lq, Socket sock) {
211         //    state 0 - nothing
212         //    state 1 - \r
213         //    state 2 - \r\n
214         //    state 3 - \r\n\r
215         //    state 4 - \r\n\r\n
216                 byte[] buffer = new byte[1024];
217                 int numchars;
218
219                 do {
220                         numchars = sock.read(buffer);
221
222                         String curr = (new String(buffer)).subString(0, numchars);
223                         
224                         lq.response.append(curr);
225                         buffer = new byte[1024];
226                 } while(numchars > 0);
227   }
228
229 /*
230         public static void readResponse(LocalQuery lq, Socket sock) {
231         //    state 0 - nothing
232         //    state 1 - \r
233         //    state 2 - \r\n
234         //    state 3 - \r\n\r
235         //    state 4 - \r\n\r\n
236     int state=0;
237     while(true) {
238       if (state<4) {
239         if (state==0) {
240           byte[] b=new byte[1];
241           int numchars=sock.read(b);
242           if ((numchars==1)) {
243             if (b[0]=='\r') {
244               state++;
245             }
246           } else
247                                                 return;
248         } else if (state==1) {
249           byte[] b=new byte[1];
250           int numchars=sock.read(b);
251           if (numchars==1) {
252             if (b[0]=='\n')
253               state++;
254             else
255               state=0;
256           } else return;
257         } else if (state==2) {
258           byte[] b=new byte[1];
259           int numchars=sock.read(b);
260           if (numchars==1) {
261             if (b[0]=='\r')
262               state++;
263             else
264               state=0;
265           } else return;
266         } else if (state==3) {
267           byte[] b=new byte[1];
268           int numchars=sock.read(b);
269           if (numchars==1) {
270             if (b[0]=='\n')
271               state++;
272             else
273               state=0;
274           } else return;
275         }
276       } else {
277                                 byte[] buffer=new byte[1024];
278         int numchars=sock.read(buffer);
279         if (numchars==0)
280           return;
281         else {
282           String curr=(new String(buffer)).subString(0,numchars);
283 //                                      System.out.println("numchars = "+numchars);
284                                         lq.response.append(curr);
285         }
286       }
287     }
288   }
289 */
290         public void processList() {
291                 LinkedList ll;
292                 GlobalString token = null;
293                 int mindex = 0;
294                 int endquote = 0;
295
296                 while (endquote != -1) {
297                         endquote = gTitle.indexOf(' ', mindex);
298
299                         if (endquote != -1) {
300                                 token = gTitle.subString(mindex, endquote);
301                                 mindex = endquote + 1;
302                                 if (filter(token)) {
303                                         continue;
304                                 }
305                                 token = refine(token);
306                         }
307                         else {
308                                 token = gTitle.subString(mindex);
309                                 token = refine(token);
310                         }
311
312                         Queue q = (Queue)results.get(token);
313                         if (q == null) {
314                                 q = global new Queue();
315                         }
316                         q.push(workingURL);     
317                         results.put(token, q);
318 //                      System.out.println("Key : ["+token.toLocalString()+"],["+q.size()+"]");
319                 }
320         }
321
322         public boolean filter(GlobalString str) {
323                 if (str.equals("of"))   return true;
324                 else if (str.equals("for")) return true;
325                 else if (str.equals("a")) return true;
326                 else if (str.equals("an")) return true;
327                 else if (str.equals("the")) return true;
328                 else if (str.equals("at")) return true;
329                 else if (str.equals("and")) return true;
330                 else if (str.equals("or")) return true;
331                 else if (str.equals("but")) return true;
332                 else if (str.equals("to")) return true;
333                 else if (str.equals("The")) return true;
334                 else if (str.length() == 1) {
335                         if (str.charAt(0) == '.') return true;
336                         else if (str.charAt(0) == '.') return true;
337                         else if (str.charAt(0) == '-') return true;
338                         else if (str.charAt(0) == '=') return true;
339                         else if (str.charAt(0) == '_') return true;
340                         else if (str.charAt(0) == ':') return true;
341                         else if (str.charAt(0) == ';') return true;
342                         else if (str.charAt(0) == '\'') return true;
343                         else if (str.charAt(0) == '\"') return true;
344                         else if (str.charAt(0) == '|') return true;
345                         else if (str.charAt(0) == '@') return true;
346                         else if (str.charAt(0) == '&') return true;
347                         else if (str.charAt(0) == ' ') return true;
348                 }
349                 else return false;
350         }
351
352         public GlobalString refine(GlobalString str) {
353                 str = refinePrefix(str);
354                 str = refinePostfix(str);
355                 return str;
356         }
357
358         public GlobalString refinePrefix(GlobalString str) {
359                 if (str.charAt(0) == '&') {             // &
360                         return str.subString(1);
361                 }
362                 else if (str.charAt(0) == '/') {                // &
363                         return str.subString(1);
364                 }
365                 return str;
366         }
367
368         public GlobalString refinePostfix(GlobalString str) {
369                 if (str.charAt(str.length()-1) == ',') {                        // ,
370                         return str.subString(0, str.length()-1);
371                 }
372                 else if (str.charAt(str.length()-1) == ':') {           // :
373                         return str.subString(0, str.length()-1);
374                 }
375                 else if (str.charAt(str.length()-1) == ';') {           // ;
376                         return str.subString(0, str.length()-1);
377                 }
378                 else if (str.charAt(str.length()-1) == '!') {           // !
379                         return str.subString(0, str.length()-1);
380                 }
381                 else if (str.charAt(str.length()-1) == 's') {                   // 's
382                         if (str.charAt(str.length()-2) == '\'')
383                                 return str.subString(0, str.length()-2);        
384                 }
385                 else if (str.charAt(str.length()-1) == '-') {
386                         int index = str.length()-2;
387                         while (Character.isWhitespace(str.charAt(index--)));
388                         return str.subString(0, index+2);
389                 }
390                 return str;
391         }
392         
393   public static Queue processPage(LocalQuery lq) {
394     int index = 0;
395         String href = new String("href=\"");
396         String searchstr = lq.response.toString();
397                 int depth;
398         boolean cont = true;
399                 Queue toprocess;
400
401                 depth = lq.getDepth() + 1;
402
403                 toprocess = global new Queue();
404                 while(cont) {
405                         int mindex = searchstr.indexOf(href,index);
406                         if (mindex != -1) {     
407                                 int endquote = searchstr.indexOf('"', mindex+href.length());
408                 if (endquote != -1) {
409                       String match = searchstr.subString(mindex+href.length(), endquote);
410                                         String match2 = lq.makewebcanonical(match);
411         
412                                         GlobalString ghostname;
413                                         GlobalString gpath;
414
415                                         ghostname = global new GlobalString(lq.getHostName(match));
416                                         gpath = global new GlobalString(lq.getPathName(match));
417
418                       if (match2 != null) {
419                                                         GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);
420                                                         toprocess.push(gq);
421                                         }
422                                         index = endquote;
423         } else cont = false;
424       } else cont = false;
425     }
426                 return toprocess;
427   }
428 }