diff options
Diffstat (limited to 'src/Scraping/Scraping.c')
| -rw-r--r-- | src/Scraping/Scraping.c | 589 |
1 files changed, 115 insertions, 474 deletions
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index 4c87890..baf536c 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -1,395 +1,20 @@ #include "Scraping.h" #include "../Cache/Cache.h" #include "../Proxy/Proxy.h" -#include "../Utility/Unescape.h" -#include "../Utility/XmlHelper.h" #include "Config.h" #include <curl/curl.h> #include <libxml/HTMLparser.h> -#include <libxml/xpath.h> #include <stdio.h> #include <stdlib.h> -#include <string.h> #include <time.h> -#include <unistd.h> - -static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, - void *userp) { - size_t realsize = size * nmemb; - MemoryBuffer *mem = (MemoryBuffer *)userp; - - if (mem->size + realsize + 1 > mem->capacity) { - size_t new_cap = - mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity * 2; - while (new_cap < mem->size + realsize + 1) - new_cap *= 2; - - char *ptr = (char *)realloc(mem->memory, new_cap); - if (!ptr) { - return 0; - } - mem->memory = ptr; - mem->capacity = new_cap; - } - - memcpy(&(mem->memory[mem->size]), contents, realsize); - mem->size += realsize; - mem->memory[mem->size] = 0; - - return realsize; -} - -static const char *get_random_user_agent(void) { - static const char *agents[] = { - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " - "like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like " - "Gecko) " - "Chrome/120.0.0.0` Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 " - "Firefox/121.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " - "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"}; - return agents[rand() % 5]; -} - -static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { - (void)engine_name; - int found_count = 0; - - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx) { - return 0; - } - - xmlXPathObjectPtr xpathObj = xml_xpath_eval( - xpathCtx, "//tr[not(contains(@class, " - "'result-sponsored'))]//a[@class='result-link']"); - - if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - int num_links = xpathObj->nodesetval->nodeNr; - *out_results = xml_result_alloc(num_links, max_results); - if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - for (int i = 0; i < num_links && found_count < max_results; i++) { - xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i]; - char *title = xml_node_content(linkNode); - char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href"); - char *snippet_text = NULL; - - xmlNodePtr current = linkNode->parent; - while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) - current = current->parent; - - if (current && current->next) { - xmlNodePtr snippetRow = current->next; - while (snippetRow && - xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0) - snippetRow = snippetRow->next; - if (snippetRow) { - xpathCtx->node = snippetRow; - xmlXPathObjectPtr sObj = - xml_xpath_eval(xpathCtx, ".//td[@class='result-snippet']"); - if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) { - snippet_text = xml_node_content(sObj->nodesetval->nodeTab[0]); - } - if (sObj) - xmlXPathFreeObject(sObj); - xpathCtx->node = NULL; - } - } - - (*out_results)[found_count].url = unescape_search_url(url); - (*out_results)[found_count].title = strdup(title ? title : "No Title"); - (*out_results)[found_count].snippet = - strdup(snippet_text ? snippet_text : ""); - found_count++; - - if (title) - xmlFree(title); - if (url) - xmlFree(url); - if (snippet_text) - xmlFree(snippet_text); - } - - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return found_count; -} - -static int parse_startpage(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { - (void)engine_name; - int found_count = 0; - - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx) { - return 0; - } - - xmlXPathObjectPtr xpathObj = - xml_xpath_eval(xpathCtx, "//div[contains(@class, 'result')]"); - - if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - int num_results = xpathObj->nodesetval->nodeNr; - *out_results = xml_result_alloc(num_results, max_results); - if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - - for (int i = 0; i < num_results && found_count < max_results; i++) { - xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; - xpathCtx->node = resultNode; - - xmlXPathObjectPtr linkObj = - xml_xpath_eval(xpathCtx, ".//a[contains(@class, 'result-link')]"); - char *url = - (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) - ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], - (xmlChar *)"href") - : NULL; - - xmlXPathObjectPtr titleObj = - xml_xpath_eval(xpathCtx, ".//h2[contains(@class, 'wgl-title')]"); - char *title = - (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? xml_node_content(titleObj->nodesetval->nodeTab[0]) - : NULL; - - xmlXPathObjectPtr snippetObj = - xml_xpath_eval(xpathCtx, ".//p[contains(@class, 'description')]"); - char *snippet_text = - (snippetObj && snippetObj->nodesetval && - snippetObj->nodesetval->nodeNr > 0) - ? xml_node_content(snippetObj->nodesetval->nodeTab[0]) - : NULL; - - if (url && title) { - (*out_results)[found_count].url = strdup(url); - (*out_results)[found_count].title = strdup(title); - (*out_results)[found_count].snippet = - strdup(snippet_text ? snippet_text : ""); - found_count++; - } - - if (title) - xmlFree(title); - if (url) - xmlFree(url); - if (snippet_text) - xmlFree(snippet_text); - if (linkObj) - xmlXPathFreeObject(linkObj); - if (titleObj) - xmlXPathFreeObject(titleObj); - if (snippetObj) - xmlXPathFreeObject(snippetObj); - } - - xpathCtx->node = NULL; - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return found_count; -} - -static int parse_yahoo(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { - (void)engine_name; - int found_count = 0; - - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx) { - return 0; - } - - xmlXPathObjectPtr xpathObj = - xml_xpath_eval(xpathCtx, "//div[contains(@class, 'algo-sr')]"); - - if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; - } - int num_results = xpathObj->nodesetval->nodeNr; - *out_results = xml_result_alloc(num_results, max_results); - if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); +int check_cache_for_job(ScrapeJob *job) { + if (get_cache_ttl_search() <= 0) return 0; - } - - for (int i = 0; i < num_results && found_count < max_results; i++) { - xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; - xpathCtx->node = resultNode; - - xmlXPathObjectPtr linkObj = xml_xpath_eval( - xpathCtx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']"); - char *url = - (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) - ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], - (xmlChar *)"href") - : NULL; - - xmlXPathObjectPtr titleObj = - xml_xpath_eval(xpathCtx, ".//h3[contains(@class, 'title')]"); - char *title = - (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? xml_node_content(titleObj->nodesetval->nodeTab[0]) - : NULL; - - xmlXPathObjectPtr snippetObj = - xml_xpath_eval(xpathCtx, ".//div[contains(@class, 'compText')]//p"); - char *snippet_text = - (snippetObj && snippetObj->nodesetval && - snippetObj->nodesetval->nodeNr > 0) - ? xml_node_content(snippetObj->nodesetval->nodeTab[0]) - : NULL; - - if (url && title) { - (*out_results)[found_count].url = unescape_search_url(url); - (*out_results)[found_count].title = strdup(title); - (*out_results)[found_count].snippet = - strdup(snippet_text ? snippet_text : ""); - found_count++; - } - - if (title) - xmlFree(title); - if (url) - xmlFree(url); - if (snippet_text) - xmlFree(snippet_text); - if (linkObj) - xmlXPathFreeObject(linkObj); - if (titleObj) - xmlXPathFreeObject(titleObj); - if (snippetObj) - xmlXPathFreeObject(snippetObj); - } - - xpathCtx->node = NULL; - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return found_count; -} - -const SearchEngine ENGINE_REGISTRY[] = { - {.name = "DuckDuckGo Lite", - .base_url = "https://lite.duckduckgo.com/lite/?q=", - .host_header = "lite.duckduckgo.com", - .referer = "https://lite.duckduckgo.com/", - .page_param = "s", - .page_multiplier = 30, - .page_base = 0, - .parser = parse_ddg_lite}, - {.name = "Startpage", - .base_url = "https://www.startpage.com/sp/search?q=", - .host_header = "www.startpage.com", - .referer = "https://www.startpage.com/", - .page_param = "page", - .page_multiplier = 1, - .page_base = 1, - .parser = parse_startpage}, - {.name = "Yahoo", - .base_url = "https://search.yahoo.com/search?p=", - .host_header = "search.yahoo.com", - .referer = "https://search.yahoo.com/", - .page_param = "b", - .page_multiplier = 10, - .page_base = 1, - .parser = parse_yahoo}}; - -const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine); - -#define CURL_TIMEOUT 15L -#define CURL_DNS_TIMEOUT 300L - -static void configure_curl_handle(CURL *curl, const char *full_url, - MemoryBuffer *chunk, - struct curl_slist *headers) { - curl_easy_setopt(curl, CURLOPT_URL, full_url); - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk); - curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent()); - - curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); - curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT); - curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); - curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); - - apply_proxy_settings(curl); -} - -static char *build_search_url(const char *base_url, const char *page_param, - int page_multiplier, int page_base, - const char *encoded_query, int page) { - int page_value = (page < 1 ? 1 : page - 1) * page_multiplier + page_base; - char *url = malloc(BUFFER_SIZE_LARGE); - if (!url) { - return NULL; - } - snprintf(url, BUFFER_SIZE_LARGE, "%s%s&%s=%d", base_url, encoded_query, - page_param, page_value); - return url; -} - -static struct curl_slist *build_request_headers(const char *host_header, - const char *referer) { - struct curl_slist *headers = NULL; - char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM]; - - snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header); - snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer); - - headers = curl_slist_append(headers, host_buf); - headers = curl_slist_append(headers, ref_buf); - headers = curl_slist_append( - headers, - "Accept: " - "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); - headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); - headers = curl_slist_append(headers, "DNT: 1"); - - return headers; -} - -static int check_cache_for_job(ScrapeJob *job) { - if (get_cache_ttl_search() <= 0) { - return 0; - } char *key = cache_compute_key(job->query, job->page, job->engine->name); - if (!key) { + if (!key) return 0; - } char *cached_data = NULL; size_t cached_size = 0; @@ -414,27 +39,31 @@ static int check_cache_for_job(ScrapeJob *job) { return 0; } -static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { - if (msg->data.result == CURLE_OK && job->response.size > 0) { - char *key = cache_compute_key(job->query, job->page, job->engine->name); - if (key && get_cache_ttl_search() > 0) { - cache_set(key, job->response.memory, job->response.size); - free(key); - } +void parse_and_cache_response(ScrapeJob *job) { + if (job->response.size == 0) { + job->results_count = 0; + return; + } - xmlDocPtr doc = htmlReadMemory( - job->response.memory, job->response.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (key && get_cache_ttl_search() > 0) + cache_set(key, job->response.memory, job->response.size); + free(key); - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); - } + xmlDocPtr doc = htmlReadMemory( + job->response.memory, job->response.size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); } else { job->results_count = 0; } +} +void cleanup_job_handle(ScrapeJob *job, CURL *handle) { struct curl_slist *headers = NULL; curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); if (headers) @@ -444,67 +73,112 @@ static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { job->response.memory = NULL; } -int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { - int retries = 0; +void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { + if (msg->data.result == CURLE_OK) + parse_and_cache_response(job); + else + job->results_count = 0; -retry: - CURLM *multi_handle = curl_multi_init(); - if (!multi_handle) { + cleanup_job_handle(job, handle); +} + +int setup_job(ScrapeJob *job, CURLM *multi_handle) { + if (job->handle) + curl_easy_cleanup(job->handle); + if (job->response.memory) + free(job->response.memory); + + if (check_cache_for_job(job)) { + job->results_count = job->results_count > 0 ? job->results_count : 0; + return 0; + } + + char *encoded_query = curl_easy_escape(NULL, job->query, 0); + if (!encoded_query) + return -1; + + char *full_url = + build_search_url(job->engine->base_url, job->engine->page_param, + job->engine->page_multiplier, job->engine->page_base, + encoded_query, job->page); + free(encoded_query); + + if (!full_url) + return -1; + + job->handle = curl_easy_init(); + if (!job->handle) { + free(full_url); return -1; } - for (int i = 0; i < num_jobs; i++) { - ScrapeJob *job = &jobs[i]; + job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE); + job->response.size = 0; + job->response.capacity = INITIAL_BUFFER_SIZE; - if (job->handle) { - curl_easy_cleanup(job->handle); - job->handle = NULL; - } - if (job->response.memory) { - free(job->response.memory); - } + struct curl_slist *headers = + build_request_headers(job->engine->host_header, job->engine->referer); - if (check_cache_for_job(job)) { - job->results_count = job->results_count > 0 ? job->results_count : 0; - continue; - } + configure_curl_handle(job->handle, full_url, &job->response, headers); + curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); - char *encoded_query = curl_easy_escape(NULL, job->query, 0); - if (!encoded_query) { - continue; - } + free(full_url); + curl_multi_add_handle(multi_handle, job->handle); + return 0; +} - char *full_url = - build_search_url(job->engine->base_url, job->engine->page_param, - job->engine->page_multiplier, job->engine->page_base, - encoded_query, job->page); - free(encoded_query); +int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) { + CURLMsg *msg; + int msgs_left; - if (!full_url) { + while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { + if (msg->msg != CURLMSG_DONE) continue; - } - job->handle = curl_easy_init(); - if (!job->handle) { - free(full_url); - continue; + CURL *handle = msg->easy_handle; + + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].handle && jobs[i].handle == handle) { + process_response(&jobs[i], handle, msg); + curl_multi_remove_handle(multi_handle, handle); + curl_easy_cleanup(handle); + jobs[i].handle = NULL; + break; + } } + } - job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE); - job->response.size = 0; - job->response.capacity = INITIAL_BUFFER_SIZE; + return 0; +} - struct curl_slist *headers = - build_request_headers(job->engine->host_header, job->engine->referer); +int should_retry(ScrapeJob *jobs, int num_jobs) { + if (proxy_count <= 0) + return 0; - configure_curl_handle(job->handle, full_url, &job->response, headers); - curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].results_count == 0 && jobs[i].response.size == 0) + return 1; + } + return 0; +} - free(full_url); - curl_multi_add_handle(multi_handle, job->handle); +int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { + int retries = 0; + +retry: + CURLM *multi_handle = curl_multi_init(); + if (!multi_handle) + return -1; + + for (int i = 0; i < num_jobs; i++) { + if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) { + curl_multi_remove_handle(multi_handle, jobs[i].handle); + curl_easy_cleanup(jobs[i].handle); + jobs[i].handle = NULL; + } } - usleep(100000 + (rand() % 100000)); + http_delay(); int still_running = 0; curl_multi_perform(multi_handle, &still_running); @@ -512,50 +186,17 @@ retry: do { int numfds = 0; CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); - - if (mc != CURLM_OK) { + if (mc != CURLM_OK) break; - } - curl_multi_perform(multi_handle, &still_running); } while (still_running); - CURLMsg *msg; - int msgs_left; - while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { - if (msg->msg == CURLMSG_DONE) { - CURL *handle = msg->easy_handle; - - for (int i = 0; i < num_jobs; i++) { - if (jobs[i].handle && jobs[i].handle == handle) { - ScrapeJob *job = &jobs[i]; - - process_job_response(job, handle, msg); - - curl_multi_remove_handle(multi_handle, handle); - if (handle) - curl_easy_cleanup(handle); - job->handle = NULL; - break; - } - } - } - } - + handle_responses(multi_handle, jobs, num_jobs); curl_multi_cleanup(multi_handle); - if (retries < max_proxy_retries && proxy_count > 0) { - int any_failed = 0; - for (int i = 0; i < num_jobs; i++) { - if (jobs[i].results_count == 0 && jobs[i].response.size == 0) { - any_failed = 1; - break; - } - } - if (any_failed) { - retries++; - goto retry; - } + if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) { + retries++; + goto retry; } return 0; |
