/* * The Alecto crawler and storage engine is released under GPL 2.0. * A license should be distributed with this code. This statement * must be released with any use of this code. * * Copyright David Kellogg, 2008 */ #include #include "crawler.h" #include "re.h" #include "client.h" #include "json/json.h" /* create crawler add url run crawl on list iterate through outbound queue ** send outbound queue to server check robots.txt cache first check robots.txt */ using namespace std; vector get_urls(string output); map used_urls; map downloaded_urls; int main(int argc, char** argv) { string base = "localhost/manual/"; int sleep_time = 0; string urls_output = client_get("urls"); vector urls = get_urls(urls_output); string down_output = client_get("downloaded"); string old_down_output = client_get("old_downloaded"); vector down_list = get_urls(down_output); vector old_down_list = get_urls(old_down_output); down_list.insert(down_list.begin(),old_down_list.begin(),old_down_list.end()); //for(unsigned int i=0;i< down_list.size();i++) { //printf("downlist is %s\n",down_list[i].c_str()); //} map downloaded_map = Crawler::to_map(down_list); // Create crawler Crawler* c = new Crawler(); // Check cache for(unsigned int i=0; i < urls.size(); i++) { string hostname = Host::get_hostname(urls[i].c_str()); string robots_url = hostname +"/robots.txt"; string robots = client_get(robots_url.c_str()); if(robots.size() == 0) { // ROBOTS MISS c->add_url(robots_url.c_str(), "__ROOT__"); c->run_crawler(); char* page = c->get_page(0); //printf("robots is %s\n",page); // SEND ROBOTS to server client_put(robots_url, page); string rob = client_get(robots_url); c->read_robots(hostname.c_str(), page); c->remove_url(robots_url.c_str()); sleep_time = 11; } else { // ROBOTS HIT c->read_robots(hostname.c_str(), robots); } } // LONG if ROBOTS downloaded sleep(sleep_time); // ADD Urls for(unsigned int i=0;iadd_url(urls[i].c_str(), "__ROOT__"); } else { //printf("2.2 not adding url %s\n", urls[i].c_str()); } } // Run the crawler // THIS stores page data c->run_crawler(); // Once urls are added, iterate over downs, not urls // STORE pages in memory char* urls_text_begin = "{ \"urls\": [ "; char* text_end = " ] }"; char* links_text_begin = "{ \"links\": [ "; string downloaded = urls_text_begin; string old_down_text = urls_text_begin; string failed = urls_text_begin; string delayed = urls_text_begin; string urls_text = urls_text_begin; for(unsigned int i=0; i < c->get_downs_size();i++) { //printf("down: "); Download* down = c->get_down(i); //printf("down: %s\n", down->get_url().c_str()); //printf("down: %s\n", down->get_page()); if(down->get_http_code() == 200) { downloaded_urls[down->get_url()] = true; // PUT PAGE of chunk memory base = down->get_base(); // printf("200 for %s\n", down->get_url().c_str()); client_put(down->get_url(), down->get_page()); c->extract_links(down, down->get_page(), base); vector links = down->get_links(); string links_text = links_text_begin; for(unsigned int ln=0;ln strlen(links_text_begin)) { links_text += ", "; urls_text += ", "; } links_text += "\""; links_text += links[ln]; links_text += "\""; urls_text += "\""; urls_text += links[ln]; urls_text += "\""; } } links_text += text_end; string links_key = "links/"; links_key += down->get_url(); client_put(links_key.c_str(), links_text.c_str()); if(! used_urls[down->get_url()] ) { if(downloaded.size() > strlen(urls_text_begin)) { downloaded += ", "; } downloaded += "\""; downloaded += down->get_url(); downloaded += "\""; used_urls[down->get_url()] = true; } } else { if(failed.size() > strlen(urls_text_begin)) { failed += ", "; } failed += "\""; failed += down->get_url(); failed += "\""; } } vector::iterator it; vector delays = c->get_delayed_pages(); for(it=delays.begin(); it != delays.end(); it++) { if(! used_urls[*it] ) { if(delayed.size() > strlen(urls_text_begin)) { delayed += ", "; } delayed += "\""; delayed += *it; //printf("delayed: %s\n", it->c_str()); delayed += "\""; used_urls[*it] = true; } } //vector::iterator it; for(it=down_list.begin(); it != down_list.end(); it++) { if(! used_urls[*it] ) { if(old_down_text.size() > strlen(urls_text_begin)) { old_down_text += ", "; } old_down_text += "\""; old_down_text += *it; // printf("old_down: %s\n", it->c_str()); old_down_text += "\""; used_urls[*it] = true; } } old_down_text += text_end; //printf("delayed is %s\n", delayed.c_str()); downloaded += text_end; client_put("downloaded", downloaded.c_str()); client_put("old_downloaded", old_down_text.c_str()); //printf("old_downloaded: %s\n", downloaded.c_str()); failed += text_end; client_put("failed", failed.c_str()); delayed += text_end; client_put("delayed", delayed.c_str()); urls_text += text_end; client_put("urls", urls_text.c_str()); string state_saved = client_command("save_state"); //printf("state result: %s \n",state_saved.c_str() ); return 0; } /* * * Get json-encoded URLs. * */ vector get_urls(string output) { vector urls_out; if(output.length() == 0) { return urls_out; } char* out = (char*)output.c_str(); //printf("get urls out: %s\n", out); json_object* url_obj = json_tokener_parse(out); json_object* urls = json_object_object_get(url_obj, "urls"); for(int i=0; i < json_object_array_length(urls); i++) { json_object* url = json_object_array_get_idx(urls, i); string url_text = json_object_get_string(url); urls_out.push_back(url_text); } return urls_out; }