8064a62471dd43f88e66ae544f2c4afc98dc35f1
[IRC.git] / Robust / src / Benchmarks / Recovery / Spider / QueryTask.java
1 public class QueryTask extends Task {
2         int maxDepth;
3         Queue toprocess;
4         DistributedHashMap results;
5         GlobalString gTitle;
6         GlobalString workingURL;
7
8   public QueryTask(Queue todoList, DistributedHashMap doneList, int maxDepth, DistributedHashMap results) {
9     this.todoList = todoList;
10                 this.doneList = doneList;
11                 this.maxDepth = maxDepth;
12                 this.results = results;
13                 toprocess = global new Queue();
14   }
15
16   public void execute() {
17                 int depth;
18                 int max;
19                 
20                 atomic {
21                         depth = ((GlobalQuery)myWork).getDepth();
22       max = this.maxDepth;
23                 }
24
25                 if (depth < max) {
26                         /* global variables */
27                         GlobalQuery gq;
28
29                         /* local variables */
30                         LocalQuery lq;
31                         String hostname;
32                         String path;
33                         String title;
34
35                         atomic {
36                                 gq = (GlobalQuery)myWork;
37                                 hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
38                                 path = new String(GlobalString.toLocalCharArray(gq.getPath()));
39
40                                 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
41                                 gsb.append("/");
42                                 gsb.append(path);
43                                 workingURL = global new GlobalString(gsb.toGlobalString());
44                                 gTitle = null;
45                         }
46                         lq = new LocalQuery(hostname, path, depth);
47
48                         System.printString("["+lq.getDepth()+"] ");
49                         System.printString("Processing - Hostname : ");
50                         System.printString(hostname);
51                         System.printString(", Path : ");
52                         System.printString(path);
53                         System.printString("\n");
54
55                         if (isDocument(path)) {
56                                 return;
57                         }
58
59                         Socket s = new Socket();
60
61                         if(s.connect(hostname, 80) == -1) {
62                                 return;
63                         }
64
65                         requestQuery(hostname, path, s);
66                         readResponse(lq, s);
67
68                         if ((title = grabTitle(lq)) != null) {
69                                 atomic {
70                                         gTitle = global new GlobalString(title);
71                                 }
72                                 atomic {
73                                         toprocess = processPage(lq);
74                                 }
75                         }
76                         s.close();
77                 }
78   }
79         
80         public static boolean isDocument(String str) {
81                 int index = str.lastindexOf('.');
82
83                 if (index != -1) {
84                         if ((str.subString(index+1)).equals("pdf")) return true;
85                         else if ((str.subString(index+1)).equals("ps")) return true;
86                         else if ((str.subString(index+1)).equals("ppt")) return true;
87                         else if ((str.subString(index+1)).equals("pptx")) return true;
88                         else if ((str.subString(index+1)).equals("jpg")) return true;
89                         else if ((str.subString(index+1)).equals("mp3")) return true;
90                         else if ((str.subString(index+1)).equals("wmv")) return true;
91                         else if ((str.subString(index+1)).equals("doc")) return true;
92                         else if ((str.subString(index+1)).equals("docx")) return true;
93                         else if ((str.subString(index+1)).equals("mov")) return true;
94                         else if ((str.subString(index+1)).equals("flv")) return true;
95                         else return false;
96                 }
97                 return false;
98         }
99
100         public void done(Object obj) {
101                 if ((gTitle != null) && (gTitle.length() > 0)) {
102                         processList();
103                 }
104
105                 while(!toprocess.isEmpty()) {
106                         GlobalQuery q = (GlobalQuery)toprocess.pop();
107
108                         GlobalString hostname = global new GlobalString(q.getHostName());
109                         GlobalString path = global new GlobalString(q.getPath());
110
111                         GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
112                         gsb.append("/");
113                         gsb.append(path);
114
115                         if (!doneList.containsKey(gsb.toGlobalString())) {
116                                 todoList.push(q);
117                                         
118                                 GlobalString str = global new GlobalString("1");
119                                 doneList.put(gsb.toGlobalString(), str);
120                         }
121                 }
122         }
123
124         public static String grabTitle(LocalQuery lq) {
125                 String sBrace = new String("<");        
126                 String strTitle = new String("title>");
127         String searchstr = lq.response.toString();
128                 String title = null;
129                 char ch;
130
131                 int mindex = -1;
132                 int endquote = -1;
133                 int i, j;
134                 String tmp;
135
136                 for (i = 0; i < searchstr.length(); i++) {
137                         if (searchstr.charAt(i) == '<') {
138                                 i++;
139                                 if (searchstr.length() > (i+strTitle.length())) {
140                                         tmp = searchstr.subString(i, i+strTitle.length());
141                                         if (tmp.equalsIgnoreCase("title>")) {
142                                                 mindex = i + tmp.length();
143                                                 for (j = mindex; j < searchstr.length(); j++) {
144                                                         if (searchstr.charAt(j) == '<') {
145                                                                 j++;
146                                                                 tmp = searchstr.subString(j, j+strTitle.length()+1);                    
147                                                                 if (tmp.equalsIgnoreCase("/title>")) {
148                                                                         endquote = j - 1;
149                                                                         break;
150                                                                 }
151                                                         }
152                                                 }
153                                         }
154                                 }
155                         }
156                 }
157
158                 if (mindex != -1) {
159                         title = searchstr.subString(mindex, endquote);
160                         if (Character.isWhitespace(title.charAt(0))){
161                                 mindex=0;
162                                 while (Character.isWhitespace(title.charAt(mindex++)));
163                                 mindex--;
164                                 if (mindex >= title.length()) return null;
165                                 title = new String(title.subString(mindex));
166                         }
167
168                         if (Character.isWhitespace(title.charAt(title.length()-1))) {
169                                 endquote=title.length()-1;
170                                 while (Character.isWhitespace(title.charAt(endquote--)));
171                                 endquote += 2;
172                                 if (mindex >= endquote) return null;
173                                 title = new String(title.subString(0, endquote));
174                         }
175
176                         if (isErrorPage(title)) {
177                                 return null;
178                         }
179                 }
180
181                 return title;
182         }
183
184         public static boolean isErrorPage(String str) { 
185                 if (str.equals("301 Moved Permanently")) 
186                         return true;
187                 else if (str.equals("302 Found")) 
188                         return true;
189                 else if (str.equals("404 Not Found")) 
190                         return true;
191                 else if (str.equals("403 Forbidden")) 
192                         return true;
193                 else if (str.equals("404 File Not Found")) 
194                         return true;
195                 else
196                         return false;
197         }
198
199         public static void requestQuery(String hostname, String path, Socket sock) {
200     StringBuffer req = new StringBuffer("GET "); 
201     req.append("/");
202                 req.append(path);
203           req.append(" HTTP/1.0\r\nHost: ");
204     req.append(hostname);
205     req.append("\r\n\r\n");
206     sock.write(req.toString().getBytes());
207   }
208
209         public static void readResponse(LocalQuery lq, Socket sock) {
210         //    state 0 - nothing
211         //    state 1 - \r
212         //    state 2 - \r\n
213         //    state 3 - \r\n\r
214         //    state 4 - \r\n\r\n
215                 byte[] buffer = new byte[1024];
216                 int numchars;
217
218                 do {
219                         numchars = sock.read(buffer);
220
221                         String curr = (new String(buffer)).subString(0, numchars);
222                         
223                         lq.response.append(curr);
224                         buffer = new byte[1024];
225                 } while(numchars > 0);
226   }
227
228         public void processList() {
229                 LinkedList ll;
230                 GlobalString token = null;
231                 int mindex = 0;
232                 int endquote = 0;
233
234                 while (endquote != -1) {
235                         endquote = gTitle.indexOf(' ', mindex);
236
237                         if (endquote != -1) {
238                                 token = gTitle.subString(mindex, endquote);
239                                 mindex = endquote + 1;
240                                 if (filter(token)) {
241                                         continue;
242                                 }
243                                 token = refine(token);
244                         }
245                         else {
246                                 token = gTitle.subString(mindex);
247                                 token = refine(token);
248                         }
249
250                         Queue q = (Queue)results.get(token);
251                         if (q == null) {
252                                 q = global new Queue();
253                         }
254                         q.push(workingURL);     
255                         results.put(token, q);
256                 }
257         }
258
259         public boolean filter(GlobalString str) {
260                 if (str.equals("of"))   return true;
261                 else if (str.equals("for")) return true;
262                 else if (str.equals("a")) return true;
263                 else if (str.equals("an")) return true;
264                 else if (str.equals("the")) return true;
265                 else if (str.equals("at")) return true;
266                 else if (str.equals("and")) return true;
267                 else if (str.equals("or")) return true;
268                 else if (str.equals("but")) return true;
269                 else if (str.equals("to")) return true;
270                 else if (str.equals("The")) return true;
271                 else if (str.length() == 1) {
272                         if (str.charAt(0) == '.') return true;
273                         else if (str.charAt(0) == '.') return true;
274                         else if (str.charAt(0) == '-') return true;
275                         else if (str.charAt(0) == '=') return true;
276                         else if (str.charAt(0) == '_') return true;
277                         else if (str.charAt(0) == ':') return true;
278                         else if (str.charAt(0) == ';') return true;
279                         else if (str.charAt(0) == '\'') return true;
280                         else if (str.charAt(0) == '\"') return true;
281                         else if (str.charAt(0) == '|') return true;
282                         else if (str.charAt(0) == '@') return true;
283                         else if (str.charAt(0) == '&') return true;
284                         else if (str.charAt(0) == ' ') return true;
285                 }
286                 else return false;
287         }
288
289         public GlobalString refine(GlobalString str) {
290                 str = refinePrefix(str);
291                 str = refinePostfix(str);
292                 return str;
293         }
294
295         public GlobalString refinePrefix(GlobalString str) {
296                 if (str.charAt(0) == '&') {             // &
297                         return str.subString(1);
298                 }
299                 else if (str.charAt(0) == '/') {                // &
300                         return str.subString(1);
301                 }
302                 return str;
303         }
304
305         public GlobalString refinePostfix(GlobalString str) {
306                 if (str.charAt(str.length()-1) == ',') {                        // ,
307                         return str.subString(0, str.length()-1);
308                 }
309                 else if (str.charAt(str.length()-1) == ':') {           // :
310                         return str.subString(0, str.length()-1);
311                 }
312                 else if (str.charAt(str.length()-1) == ';') {           // ;
313                         return str.subString(0, str.length()-1);
314                 }
315                 else if (str.charAt(str.length()-1) == '!') {           // !
316                         return str.subString(0, str.length()-1);
317                 }
318                 else if (str.charAt(str.length()-1) == 's') {                   // 's
319                         if (str.charAt(str.length()-2) == '\'')
320                                 return str.subString(0, str.length()-2);        
321                 }
322                 else if (str.charAt(str.length()-1) == '-') {
323                         int index = str.length()-2;
324                         while (Character.isWhitespace(str.charAt(index--)));
325                         return str.subString(0, index+2);
326                 }
327                 return str;
328         }
329         
330   public static Queue processPage(LocalQuery lq) {
331     int index = 0;
332         String href = new String("href=\"");
333         String searchstr = lq.response.toString();
334                 int depth;
335         boolean cont = true;
336                 Queue toprocess;
337
338                 depth = lq.getDepth() + 1;
339
340                 toprocess = global new Queue();
341                 while(cont) {
342                         int mindex = searchstr.indexOf(href,index);
343                         if (mindex != -1) {     
344                                 int endquote = searchstr.indexOf('"', mindex+href.length());
345                 if (endquote != -1) {
346                       String match = searchstr.subString(mindex+href.length(), endquote);
347                                         String match2 = lq.makewebcanonical(match);
348         
349                                         GlobalString ghostname;
350                                         GlobalString gpath;
351
352                                         ghostname = global new GlobalString(lq.getHostName(match));
353                                         gpath = global new GlobalString(lq.getPathName(match));
354
355                       if (match2 != null) {
356                                                         GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);
357                                                         toprocess.push(gq);
358                                         }
359                                         index = endquote;
360         } else cont = false;
361       } else cont = false;
362     }
363                 return toprocess;
364   }
365 }