/**
 * Task that works on one query at a time: execute() reads a host name and path
 * from the current work item, sends an HTTP request for that page, and keeps
 * the page title and the queue of follow-up queries returned by processPage().
 * NOTE(review): this listing shows only some of the file's lines (the embedded
 * line numbers skip), so members that are not visible here are not described.
 */
1 public class QueryTask extends Task {
// Map passed to the constructor; processList() looks tokens up in it and stores queues under them.
4 DistributedHashMap results;
// Assigned in execute() from a GlobalStringBuffer that starts with the host name.
6 GlobalString workingURL;
/**
 * Stores the supplied work structures in this task and allocates a fresh
 * Queue for toprocess.
 *
 * @param todoList queue stored in this.todoList
 * @param doneList map stored in this.doneList; done() checks and records keys in it
 * @param maxDepth value stored in this.maxDepth (its use is not visible in this listing)
 * @param results  map stored in this.results; processList() reads and writes it
 */
8 public QueryTask(Queue todoList, DistributedHashMap doneList, int maxDepth, DistributedHashMap results) {
9 this.todoList = todoList;
10 this.doneList = doneList;
11 this.maxDepth = maxDepth;
12 this.results = results;
// Start with an empty queue; execute() later replaces it with processPage()'s result.
13 toprocess = global new Queue();
/**
 * Processes the current work item (myWork, cast to GlobalQuery): reads its
 * depth, host name and path, records workingURL, prints a progress line,
 * sends an HTTP request for the page, and keeps the page title (gTitle) and
 * the queue returned by processPage() (toprocess).
 * NOTE(review): several statements of this method (branch bodies, local
 * declarations) are not visible in this listing; only visible steps are described.
 */
16 public void execute() {
// Depth of the query being worked on, taken from the work item.
21 depth = ((GlobalQuery)myWork).getDepth();
26 /* global variables */
36 gq = (GlobalQuery)myWork;
// Copy the host name and path into ordinary Strings via GlobalString.toLocalCharArray().
37 hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
38 path = new String(GlobalString.toLocalCharArray(gq.getPath()));
// The buffer starts with the host name; workingURL is created from the buffer's contents.
40 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
43 workingURL = global new GlobalString(gsb.toGlobalString());
46 lq = new LocalQuery(hostname, path, depth);
// Progress output of the form "[depth] Processing - Hostname : <host>, Path : <path>".
48 System.printString("["+lq.getDepth()+"] ");
49 System.printString("Processing - Hostname : ");
50 System.printString(hostname);
51 System.printString(", Path : ");
52 System.printString(path);
53 System.printString("\n");
// Paths recognised by isDocument() are handled specially (the branch body is not shown here).
55 if (isDocument(path)) {
59 Socket s = new Socket();
// Connect on port 80; a result of -1 is handled separately (handler not shown; presumably a failed connection).
61 if(s.connect(hostname, 80) == -1) {
// Send the HTTP GET request over the socket.
65 requestQuery(hostname, path, s);
// grabTitle() may return null; only a non-null title is kept as a GlobalString.
68 if ((title = grabTitle(lq)) != null) {
70 gTitle = global new GlobalString(title);
// Queue produced by processPage() from lq; done() later drains it.
73 toprocess = processPage(lq);
/**
 * Checks the text after the last '.' in str against a fixed list of
 * document/media file extensions (pdf, ps, ppt, pptx, jpg, mp3, wmv, doc,
 * docx, mov, flv). The comparison uses equals(), so it is case-sensitive.
 *
 * @param str path to inspect
 * @return true when the extension equals one of the listed values; the result
 *         for other input (and for a path without '.') is not visible in this listing
 */
80 public static boolean isDocument(String str) {
// Position of the last '.'; everything after it is treated as the extension.
81 int index = str.lastindexOf('.');
84 if ((str.subString(index+1)).equals("pdf")) return true;
85 else if ((str.subString(index+1)).equals("ps")) return true;
86 else if ((str.subString(index+1)).equals("ppt")) return true;
87 else if ((str.subString(index+1)).equals("pptx")) return true;
88 else if ((str.subString(index+1)).equals("jpg")) return true;
89 else if ((str.subString(index+1)).equals("mp3")) return true;
90 else if ((str.subString(index+1)).equals("wmv")) return true;
91 else if ((str.subString(index+1)).equals("doc")) return true;
92 else if ((str.subString(index+1)).equals("docx")) return true;
93 else if ((str.subString(index+1)).equals("mov")) return true;
94 else if ((str.subString(index+1)).equals("flv")) return true;
/**
 * Post-processing step for the work item handled by execute(): acts on a
 * non-empty gTitle, then drains toprocess and records each popped query's key
 * in doneList when it is not already there.
 * NOTE(review): the body of the gTitle branch and the statements between the
 * containsKey() test and the put() are not visible in this listing.
 *
 * @param obj not referenced in the visible lines
 */
100 public void done(Object obj) {
// Only a non-null, non-empty title is acted on (branch body not shown here).
101 if ((gTitle != null) && (gTitle.length() > 0)) {
// Drain the queue that execute() filled from processPage().
105 while(!toprocess.isEmpty()) {
106 GlobalQuery q = (GlobalQuery)toprocess.pop();
108 GlobalString hostname = global new GlobalString(q.getHostName());
109 GlobalString path = global new GlobalString(q.getPath());
// Key buffer starts with the host name (presumably the path is appended in lines not shown — confirm).
111 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
// Skip queries whose key is already recorded in doneList.
115 if (!doneList.containsKey(gsb.toGlobalString())) {
// Record the key with the marker value "1".
118 GlobalString str = global new GlobalString("1");
119 doneList.put(gsb.toGlobalString(), str);
/**
 * Extracts the page title from lq.response: scans for a "title>" tag
 * (case-insensitive), then for the matching "/title>", takes the text in
 * between, strips leading and trailing whitespace, and checks the result
 * with isErrorPage().
 * NOTE(review): several statements (index adjustments, the final return, the
 * isErrorPage branch body) are not visible in this listing.
 *
 * @param lq query whose response text is searched
 * @return null in the two visible early-exit cases below; the remaining
 *         return values are not visible here
 */
124 public static String grabTitle(LocalQuery lq) {
// NOTE(review): sBrace is not referenced in any visible line — confirm whether it is used.
125 String sBrace = new String("<");
126 String strTitle = new String("title>");
127 String searchstr = lq.response.toString();
// Outer scan: look at every '<' as a possible start of the opening title tag.
136 for (i = 0; i < searchstr.length(); i++) {
137 if (searchstr.charAt(i) == '<') {
// Make sure enough characters remain for "title>" before slicing.
139 if (searchstr.length() > (i+strTitle.length())) {
140 tmp = searchstr.subString(i, i+strTitle.length());
141 if (tmp.equalsIgnoreCase("title>")) {
// mindex marks the first character after the opening tag.
142 mindex = i + tmp.length();
// Inner scan: look for the closing tag from there on.
143 for (j = mindex; j < searchstr.length(); j++) {
144 if (searchstr.charAt(j) == '<') {
// "/title>" is one character longer than strTitle, hence the +1.
// NOTE(review): no length check is visible before this slice — confirm one exists in the lines not shown.
146 tmp = searchstr.subString(j, j+strTitle.length()+1);
147 if (tmp.equalsIgnoreCase("/title>")) {
// Raw title text between the two tags.
159 title = searchstr.subString(mindex, endquote);
// Strip leading whitespace.
160 if (Character.isWhitespace(title.charAt(0))){
// NOTE(review): this loop has no bounds test of its own; an all-whitespace title would
// index past the end before the length check on the next visible line — confirm.
162 while (Character.isWhitespace(title.charAt(mindex++)));
164 if (mindex >= title.length()) return null;
165 title = new String(title.subString(mindex));
// Strip trailing whitespace, walking left from the last character.
168 if (Character.isWhitespace(title.charAt(title.length()-1))) {
169 endquote=title.length()-1;
170 while (Character.isWhitespace(title.charAt(endquote--)));
172 if (mindex >= endquote) return null;
173 title = new String(title.subString(0, endquote));
// Titles that are HTTP error texts get special handling (branch body not shown here).
176 if (isErrorPage(title)) {
/**
 * Compares str (exact, case-sensitive match) against a fixed list of HTTP
 * status page titles: "301 Moved Permanently", "302 Found", "404 Not Found",
 * "403 Forbidden" and "404 File Not Found".
 *
 * @param str page title to test
 * @return presumably true on a match and false otherwise — the return
 *         statements are not visible in this listing; confirm
 */
184 public static boolean isErrorPage(String str) {
185 if (str.equals("301 Moved Permanently"))
187 else if (str.equals("302 Found"))
189 else if (str.equals("404 Not Found"))
191 else if (str.equals("403 Forbidden"))
193 else if (str.equals("404 File Not Found"))
/**
 * Builds an HTTP/1.0 GET request with a Host header and writes it to sock.
 * The visible pieces of the request are "GET ", " HTTP/1.0\r\nHost: ",
 * the host name and the terminating blank line.
 * NOTE(review): the statements that add the request target after "GET "
 * are not visible in this listing (presumably they append path — confirm).
 *
 * @param hostname value written into the Host header
 * @param path     requested path (its use is in lines not shown here)
 * @param sock     socket the request bytes are written to
 */
199 public static void requestQuery(String hostname, String path, Socket sock) {
200 StringBuffer req = new StringBuffer("GET ");
203 req.append(" HTTP/1.0\r\nHost: ");
204 req.append(hostname);
// Empty line terminates the request headers.
205 req.append("\r\n\r\n");
206 sock.write(req.toString().getBytes());
/**
 * Reads from sock in chunks of up to 1024 bytes and appends each chunk, as
 * text, to lq.response; the loop repeats while the last read returned more
 * than zero bytes.
 * NOTE(review): the loop head and any check between read() and the
 * conversion are not visible in this listing.
 *
 * @param lq   query whose response buffer receives the data
 * @param sock socket to read from
 */
209 public static void readResponse(LocalQuery lq, Socket sock) {
214 // state 4 - \r\n\r\n
215 byte[] buffer = new byte[1024];
219 numchars = sock.read(buffer);
// Keep only the first numchars characters of the buffer just read.
// NOTE(review): confirm numchars cannot be negative when this line runs.
221 String curr = (new String(buffer)).subString(0, numchars);
223 lq.response.append(curr);
// A fresh buffer is allocated for the next read.
224 buffer = new byte[1024];
225 } while(numchars > 0);
/**
 * Splits gTitle into space-separated tokens, cleans each token with
 * refine(), and looks up the token's Queue in results, storing a Queue back
 * under the token.
 * NOTE(review): the conditions around the lookup, what is added to the queue,
 * and any use of filter() are not visible in this listing.
 */
228 public void processList() {
230 GlobalString token = null;
// Loop ends once no further space is found.
234 while (endquote != -1) {
// Next space at or after mindex; -1 means the rest of the title is the last token.
235 endquote = gTitle.indexOf(' ', mindex);
237 if (endquote != -1) {
238 token = gTitle.subString(mindex, endquote);
// Continue after the space on the next iteration.
239 mindex = endquote + 1;
243 token = refine(token);
// Last token: everything from mindex to the end of the title.
246 token = gTitle.subString(mindex);
247 token = refine(token);
// Queue currently stored for this token, if any.
250 Queue q = (Queue)results.get(token);
// Presumably executed only when no queue existed yet (guard not shown — confirm).
252 q = global new Queue();
255 results.put(token, q);
/**
 * Reports whether str is one of a fixed set of common words ("of", "for",
 * "a", "an", "the", "at", "and", "or", "but", "to", "The") or a single
 * punctuation/space character from the list below. Word matching uses
 * equals(), so apart from the explicit "The" it is case-sensitive.
 *
 * @param str token to test
 * @return true for the listed words and characters; the result for any
 *         other token is not visible in this listing (presumably false)
 */
259 public boolean filter(GlobalString str) {
260 if (str.equals("of")) return true;
261 else if (str.equals("for")) return true;
262 else if (str.equals("a")) return true;
263 else if (str.equals("an")) return true;
264 else if (str.equals("the")) return true;
265 else if (str.equals("at")) return true;
266 else if (str.equals("and")) return true;
267 else if (str.equals("or")) return true;
268 else if (str.equals("but")) return true;
269 else if (str.equals("to")) return true;
270 else if (str.equals("The")) return true;
// Single-character tokens: reject listed punctuation and the space character.
271 else if (str.length() == 1) {
272 if (str.charAt(0) == '.') return true;
// NOTE(review): identical to the test above, so this branch can never be taken;
// possibly a different character (e.g. ',') was intended — confirm.
273 else if (str.charAt(0) == '.') return true;
274 else if (str.charAt(0) == '-') return true;
275 else if (str.charAt(0) == '=') return true;
276 else if (str.charAt(0) == '_') return true;
277 else if (str.charAt(0) == ':') return true;
278 else if (str.charAt(0) == ';') return true;
279 else if (str.charAt(0) == '\'') return true;
280 else if (str.charAt(0) == '\"') return true;
281 else if (str.charAt(0) == '|') return true;
282 else if (str.charAt(0) == '@') return true;
283 else if (str.charAt(0) == '&') return true;
284 else if (str.charAt(0) == ' ') return true;
/**
 * Cleans a token by applying refinePrefix() and then refinePostfix().
 *
 * @param str token to clean
 * @return presumably the cleaned token — the return statement is not visible
 *         in this listing
 */
289 public GlobalString refine(GlobalString str) {
// Leading characters first, then trailing ones.
290 str = refinePrefix(str);
291 str = refinePostfix(str);
/**
 * Drops one leading '&' or '/' from str.
 * NOTE(review): no empty-string guard is visible before charAt(0) — confirm
 * callers never pass an empty token.
 *
 * @param str token to clean
 * @return str without its first character when that character is '&' or '/';
 *         the result for other input is not visible in this listing
 *         (presumably str unchanged)
 */
295 public GlobalString refinePrefix(GlobalString str) {
296 if (str.charAt(0) == '&') { // &
297 return str.subString(1);
299 else if (str.charAt(0) == '/') { // /
300 return str.subString(1);
/**
 * Trims one trailing decoration from str: a final ',', ':', ';' or '!', a
 * final "'s", or a final '-' together with any whitespace directly before it.
 * NOTE(review): no guards for empty or very short strings are visible
 * (charAt(length-1), charAt(length-2)) — confirm callers rule these out.
 *
 * @param str token to clean
 * @return the shortened token in the cases below; the result for other input
 *         is not visible in this listing (presumably str unchanged)
 */
305 public GlobalString refinePostfix(GlobalString str) {
306 if (str.charAt(str.length()-1) == ',') { // ,
307 return str.subString(0, str.length()-1);
309 else if (str.charAt(str.length()-1) == ':') { // :
310 return str.subString(0, str.length()-1);
312 else if (str.charAt(str.length()-1) == ';') { // ;
313 return str.subString(0, str.length()-1);
315 else if (str.charAt(str.length()-1) == '!') { // !
316 return str.subString(0, str.length()-1);
318 else if (str.charAt(str.length()-1) == 's') { // 's
// Only strip when the 's' is preceded by an apostrophe.
319 if (str.charAt(str.length()-2) == '\'')
320 return str.subString(0, str.length()-2);
322 else if (str.charAt(str.length()-1) == '-') {
// Start just left of the '-' and walk left over whitespace.
323 int index = str.length()-2;
// The post-decrement leaves index one position left of the first non-whitespace character found.
324 while (Character.isWhitespace(str.charAt(index--)));
// index+1 is that non-whitespace character, so index+2 is the exclusive end of the kept text.
325 return str.subString(0, index+2);
330 public static Queue processPage(LocalQuery lq) {
332 String href = new String("href=\"");
333 String searchstr = lq.response.toString();
338 depth = lq.getDepth() + 1;
340 toprocess = global new Queue();
342 int mindex = searchstr.indexOf(href,index);
344 int endquote = searchstr.indexOf('"', mindex+href.length());
345 if (endquote != -1) {
346 String match = searchstr.subString(mindex+href.length(), endquote);
347 String match2 = lq.makewebcanonical(match);
349 GlobalString ghostname;
352 ghostname = global new GlobalString(lq.getHostName(match));
353 gpath = global new GlobalString(lq.getPathName(match));
355 if (match2 != null) {
356 GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);