*** empty log message ***
[IRC.git] / Robust / src / Benchmarks / Recovery / Spider / QueryTask.java
1 public class QueryTask extends Task {
2         int maxDepth;
3         Queue toprocess;
4         DistributedHashMap results;
5         DistributedHashMap visitedList;
6         GlobalString gTitle;
7         GlobalString workingURL;
8
9   public QueryTask(Queue todoList, DistributedHashMap visitedList, int maxDepth, DistributedHashMap results) {
10     this.todoList = todoList;
11                 this.visitedList = visitedList;
12                 this.maxDepth = maxDepth;
13                 this.results = results;
14                 toprocess = global new Queue();
15   }
16
17   public void execute() {
18                 int depth;
19                 int max;
20                 
21                 atomic {
22                         depth = ((GlobalQuery)myWork).getDepth();
23       max = this.maxDepth;
24                 }
25
26                 if (depth < max) {
27                         /* global variables */
28                         GlobalQuery gq;
29
30                         /* local variables */
31                         LocalQuery lq;
32                         String hostname;
33                         String path;
34                         String title;
35
36                         atomic {
37                                 gq = (GlobalQuery)myWork;
38                                 hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
39                                 path = new String(GlobalString.toLocalCharArray(gq.getPath()));
40
41                                 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
42                                 gsb.append("/");
43                                 gsb.append(path);
44                                 workingURL = global new GlobalString(gsb.toGlobalString());
45                                 gTitle = null;
46                         }
47                         lq = new LocalQuery(hostname, path, depth);
48
49                         System.printString("["+lq.getDepth()+"] ");
50                         System.printString("Processing - Hostname : ");
51                         System.printString(hostname);
52                         System.printString(", Path : ");
53                         System.printString(path);
54                         System.printString("\n");
55
56                         if (isDocument(path)) {
57                                 return;
58                         }
59
60                         Socket s = new Socket();
61
62                         if(s.connect(hostname, 80) == -1) {
63                                 return;
64                         }
65
66                         requestQuery(hostname, path, s);
67                         readResponse(lq, s);
68
69                         if ((title = grabTitle(lq)) != null) {
70                                 atomic {
71                                         gTitle = global new GlobalString(title);
72                                 }
73                                 atomic {
74                                         toprocess = processPage(lq);
75                                 }
76                         }
77                         s.close();
78                 }
79   }
80         
81         public static boolean isDocument(String str) {
82                 int index = str.lastindexOf('.');
83
84                 if (index != -1) {
85                         if ((str.subString(index+1)).equals("pdf")) return true;
86                         else if ((str.subString(index+1)).equals("ps")) return true;
87                         else if ((str.subString(index+1)).equals("ppt")) return true;
88                         else if ((str.subString(index+1)).equals("pptx")) return true;
89                         else if ((str.subString(index+1)).equals("jpg")) return true;
90                         else if ((str.subString(index+1)).equals("mp3")) return true;
91                         else if ((str.subString(index+1)).equals("wmv")) return true;
92                         else if ((str.subString(index+1)).equals("doc")) return true;
93                         else if ((str.subString(index+1)).equals("docx")) return true;
94                         else if ((str.subString(index+1)).equals("mov")) return true;
95                         else if ((str.subString(index+1)).equals("flv")) return true;
96                         else return false;
97                 }
98                 return false;
99         }
100
101         public void done(Object obj) {
102                 if ((gTitle != null) && (gTitle.length() > 0)) {
103                         processList();
104                 }
105
106                 while(!toprocess.isEmpty()) {
107                         GlobalQuery q = (GlobalQuery)toprocess.pop();
108
109                         GlobalString hostname = global new GlobalString(q.getHostName());
110                         GlobalString path = global new GlobalString(q.getPath());
111
112                         GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
113                         gsb.append("/");
114                         gsb.append(path);
115
116                         if (!visitedList.containsKey(gsb.toGlobalString())) {
117                                 todoList.push(q);
118                                         
119                                 GlobalString str = global new GlobalString("1");
120                                 visitedList.put(gsb.toGlobalString(), str);
121                         }
122                 }
123         }
124
125         public static String grabTitle(LocalQuery lq) {
126                 String sBrace = new String("<");        
127                 String strTitle = new String("title>");
128         String searchstr = lq.response.toString();
129                 String title = null;
130                 char ch;
131
132                 int mindex = -1;
133                 int endquote = -1;
134                 int i, j;
135                 String tmp;
136
137                 for (i = 0; i < searchstr.length(); i++) {
138                         if (searchstr.charAt(i) == '<') {
139                                 i++;
140                                 if (searchstr.length() > (i+strTitle.length())) {
141                                         tmp = searchstr.subString(i, i+strTitle.length());
142                                         if (tmp.equalsIgnoreCase("title>")) {
143                                                 mindex = i + tmp.length();
144                                                 for (j = mindex; j < searchstr.length(); j++) {
145                                                         if (searchstr.charAt(j) == '<') {
146                                                                 j++;
147                                                                 tmp = searchstr.subString(j, j+strTitle.length()+1);                    
148                                                                 if (tmp.equalsIgnoreCase("/title>")) {
149                                                                         endquote = j - 1;
150                                                                         break;
151                                                                 }
152                                                         }
153                                                 }
154                                         }
155                                 }
156                         }
157                 }
158
159                 if (mindex != -1) {
160                         title = searchstr.subString(mindex, endquote);
161                         if (Character.isWhitespace(title.charAt(0))){
162                                 mindex=0;
163                                 while (Character.isWhitespace(title.charAt(mindex++)));
164                                 mindex--;
165                                 if (mindex >= title.length()) return null;
166                                 title = new String(title.subString(mindex));
167                         }
168
169                         if (Character.isWhitespace(title.charAt(title.length()-1))) {
170                                 endquote=title.length()-1;
171                                 while (Character.isWhitespace(title.charAt(endquote--)));
172                                 endquote += 2;
173                                 if (mindex >= endquote) return null;
174                                 title = new String(title.subString(0, endquote));
175                         }
176
177                         if (isErrorPage(title)) {
178                                 return null;
179                         }
180                 }
181
182                 return title;
183         }
184
185         public static boolean isErrorPage(String str) { 
186                 if (str.equals("301 Moved Permanently")) 
187                         return true;
188                 else if (str.equals("302 Found")) 
189                         return true;
190                 else if (str.equals("404 Not Found")) 
191                         return true;
192                 else if (str.equals("403 Forbidden")) 
193                         return true;
194                 else if (str.equals("404 File Not Found")) 
195                         return true;
196                 else
197                         return false;
198         }
199
200         public static void requestQuery(String hostname, String path, Socket sock) {
201     StringBuffer req = new StringBuffer("GET "); 
202     req.append("/");
203                 req.append(path);
204           req.append(" HTTP/1.0\r\nHost: ");
205     req.append(hostname);
206     req.append("\r\n\r\n");
207     sock.write(req.toString().getBytes());
208   }
209
210         public static void readResponse(LocalQuery lq, Socket sock) {
211         //    state 0 - nothing
212         //    state 1 - \r
213         //    state 2 - \r\n
214         //    state 3 - \r\n\r
215         //    state 4 - \r\n\r\n
216                 byte[] buffer = new byte[1024];
217                 int numchars;
218
219                 do {
220                         numchars = sock.read(buffer);
221
222                         String curr = (new String(buffer)).subString(0, numchars);
223                         
224                         lq.response.append(curr);
225                         buffer = new byte[1024];
226                 } while(numchars > 0);
227   }
228
229         public void processList() {
230                 LinkedList ll;
231                 GlobalString token = null;
232                 int mindex = 0;
233                 int endquote = 0;
234
235                 while (endquote != -1) {
236                         endquote = gTitle.indexOf(' ', mindex);
237
238                         if (endquote != -1) {
239                                 token = gTitle.subString(mindex, endquote);
240                                 mindex = endquote + 1;
241                                 if (filter(token)) {
242                                         continue;
243                                 }
244                                 token = refine(token);
245                         }
246                         else {
247                                 token = gTitle.subString(mindex);
248                                 token = refine(token);
249                         }
250
251                         Queue q = (Queue)results.get(token);
252                         if (q == null) {
253                                 q = global new Queue();
254                         }
255                         q.push(workingURL);     
256                         results.put(token, q);
257                 }
258         }
259
260         public boolean filter(GlobalString str) {
261                 if (str.equals("of"))   return true;
262                 else if (str.equals("for")) return true;
263                 else if (str.equals("a")) return true;
264                 else if (str.equals("an")) return true;
265                 else if (str.equals("the")) return true;
266                 else if (str.equals("at")) return true;
267                 else if (str.equals("and")) return true;
268                 else if (str.equals("or")) return true;
269                 else if (str.equals("but")) return true;
270                 else if (str.equals("to")) return true;
271                 else if (str.equals("The")) return true;
272                 else if (str.length() == 1) {
273                         if (str.charAt(0) == '.') return true;
274                         else if (str.charAt(0) == '.') return true;
275                         else if (str.charAt(0) == '-') return true;
276                         else if (str.charAt(0) == '=') return true;
277                         else if (str.charAt(0) == '_') return true;
278                         else if (str.charAt(0) == ':') return true;
279                         else if (str.charAt(0) == ';') return true;
280                         else if (str.charAt(0) == '\'') return true;
281                         else if (str.charAt(0) == '\"') return true;
282                         else if (str.charAt(0) == '|') return true;
283                         else if (str.charAt(0) == '@') return true;
284                         else if (str.charAt(0) == '&') return true;
285                         else if (str.charAt(0) == ' ') return true;
286                 }
287                 else return false;
288         }
289
290         public GlobalString refine(GlobalString str) {
291                 str = refinePrefix(str);
292                 str = refinePostfix(str);
293                 return str;
294         }
295
296         public GlobalString refinePrefix(GlobalString str) {
297                 if (str.charAt(0) == '&') {             // &
298                         return str.subString(1);
299                 }
300                 else if (str.charAt(0) == '/') {                // &
301                         return str.subString(1);
302                 }
303                 return str;
304         }
305
306         public GlobalString refinePostfix(GlobalString str) {
307                 if (str.charAt(str.length()-1) == ',') {                        // ,
308                         return str.subString(0, str.length()-1);
309                 }
310                 else if (str.charAt(str.length()-1) == ':') {           // :
311                         return str.subString(0, str.length()-1);
312                 }
313                 else if (str.charAt(str.length()-1) == ';') {           // ;
314                         return str.subString(0, str.length()-1);
315                 }
316                 else if (str.charAt(str.length()-1) == '!') {           // !
317                         return str.subString(0, str.length()-1);
318                 }
319                 else if (str.charAt(str.length()-1) == 's') {                   // 's
320                         if (str.charAt(str.length()-2) == '\'')
321                                 return str.subString(0, str.length()-2);        
322                 }
323                 else if (str.charAt(str.length()-1) == '-') {
324                         int index = str.length()-2;
325                         while (Character.isWhitespace(str.charAt(index--)));
326                         return str.subString(0, index+2);
327                 }
328                 return str;
329         }
330         
331   public static Queue processPage(LocalQuery lq) {
332     int index = 0;
333         String href = new String("href=\"");
334         String searchstr = lq.response.toString();
335                 int depth;
336         boolean cont = true;
337                 Queue toprocess;
338
339                 depth = lq.getDepth() + 1;
340
341                 toprocess = global new Queue();
342                 while(cont) {
343                         int mindex = searchstr.indexOf(href,index);
344                         if (mindex != -1) {     
345                                 int endquote = searchstr.indexOf('"', mindex+href.length());
346                 if (endquote != -1) {
347                       String match = searchstr.subString(mindex+href.length(), endquote);
348                                         String match2 = lq.makewebcanonical(match);
349         
350                                         GlobalString ghostname;
351                                         GlobalString gpath;
352
353                                         ghostname = global new GlobalString(lq.getHostName(match));
354                                         gpath = global new GlobalString(lq.getPathName(match));
355
356                       if (match2 != null) {
357                                                         GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);
358                                                         toprocess.push(gq);
359                                         }
360                                         index = endquote;
361         } else cont = false;
362       } else cont = false;
363     }
364                 return toprocess;
365   }
366 }