From 47f16db1909d185f7a6c5987226f64f0e2788262 Mon Sep 17 00:00:00 2001
From: frosty
Date: Thu, 22 Jan 2026 12:57:27 -0500
Subject: scraping: parallelize fetches and cut redundant allocations

---
 src/Scraping/Scraping.c | 297 ++++++++++++++++++++++++------------------------
 src/Scraping/Scraping.h |  21 +++-
 2 files changed, 171 insertions(+), 147 deletions(-)

(limited to 'src/Scraping')

diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index d2afea6..c236b0f 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -8,23 +8,25 @@
 #include
 #include
 
-typedef struct {
-  char *memory;
-  size_t size;
-} MemoryBuffer;
-
 static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
                                   void *userp) {
   size_t realsize = size * nmemb;
   MemoryBuffer *mem = (MemoryBuffer *)userp;
 
-  char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
-  if (ptr == NULL) {
-    LOG_ERROR("Not enough memory (realloc returned NULL)");
-    return 0;
+  if (mem->size + realsize + 1 > mem->capacity) {
+
+    size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2;
+    while (new_cap < mem->size + realsize + 1) new_cap *= 2;
+
+    char *ptr = (char *)realloc(mem->memory, new_cap);
+    if (!ptr) {
+      LOG_ERROR("Not enough memory (realloc returned NULL)");
+      return 0;
+    }
+    mem->memory = ptr;
+    mem->capacity = new_cap;
   }
-  mem->memory = ptr;
 
   memcpy(&(mem->memory[mem->size]), contents, realsize);
   mem->size += realsize;
   mem->memory[mem->size] = 0;
@@ -70,8 +72,7 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
   }
 
   int num_links = xpathObj->nodesetval->nodeNr;
-  LOG_INFO("[%s] XPath matched %d potential result links", engine_name,
-           num_links);
+  LOG_INFO("[%s] XPath matched %d potential result links", engine_name, num_links);
 
   int actual_alloc = (num_links < max_results) ? num_links : max_results;
   *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
@@ -98,25 +99,22 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
            xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
       snippetRow = snippetRow->next;
     if (snippetRow) {
-      xmlXPathContextPtr subCtx = xmlXPathNewContext(doc);
-      if (subCtx) {
-        subCtx->node = snippetRow;
-        xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
-            (xmlChar *)".//td[@class='result-snippet']", subCtx);
-        if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
-          snippet_text =
-              (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
-        }
-        if (sObj) xmlXPathFreeObject(sObj);
-        xmlXPathFreeContext(subCtx);
+
+      xpathCtx->node = snippetRow;
+      xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
+          (xmlChar *)".//td[@class='result-snippet']", xpathCtx);
+      if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
+        snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
       }
+      if (sObj) xmlXPathFreeObject(sObj);
+      xpathCtx->node = NULL;
+
     }
   }
 
    (*out_results)[found_count].url = strdup(url ? url : "");
    (*out_results)[found_count].title = strdup(title ? title : "No Title");
-    (*out_results)[found_count].snippet =
-        strdup(snippet_text ? snippet_text : "");
+    (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
    LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
              (*out_results)[found_count].title);
@@ -168,16 +166,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
   for (int i = 0; i < num_results && found_count < max_results; i++) {
     xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
 
-    xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
-    if (!resCtx) {
-      LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
-                i);
-      continue;
-    }
-    resCtx->node = resultNode;
+    xpathCtx->node = resultNode;
 
     xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
-        (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx);
+        (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx);
     char *url =
         (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
@@ -185,14 +177,14 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
            : NULL;
 
     xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
-        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx);
+        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx);
     char *title =
         (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
            : NULL;
 
     xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
-        (xmlChar *)".//p[contains(@class, 'description')]", resCtx);
+        (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx);
     char *snippet_text =
         (snippetObj && snippetObj->nodesetval &&
          snippetObj->nodesetval->nodeNr > 0)
@@ -204,8 +196,7 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
       (*out_results)[found_count].url = strdup(url);
       (*out_results)[found_count].title = strdup(title);
       (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
-      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
-                title);
+      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title);
       found_count++;
     } else {
       LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
@@ -218,9 +209,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
     if (linkObj) xmlXPathFreeObject(linkObj);
     if (titleObj) xmlXPathFreeObject(titleObj);
     if (snippetObj) xmlXPathFreeObject(snippetObj);
-    xmlXPathFreeContext(resCtx);
   }
 
+  xpathCtx->node = NULL;
+
   xmlXPathFreeObject(xpathObj);
   xmlXPathFreeContext(xpathCtx);
   return found_count;
@@ -262,17 +254,11 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
   for (int i = 0; i < num_results && found_count < max_results; i++) {
     xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
 
-    xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
-    if (!resCtx) {
-      LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
-                i);
-      continue;
-    }
-    resCtx->node = resultNode;
+    xpathCtx->node = resultNode;
 
     xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
         (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
-        resCtx);
+        xpathCtx);
     char *url =
         (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
@@ -280,32 +266,26 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
            : NULL;
 
     xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
-        (xmlChar *)".//h3[contains(@class, 'title')]", resCtx);
+        (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx);
     char *title =
         (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
           ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
           : NULL;
 
     xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
-        (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx);
+        (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx);
     char *snippet_text =
         (snippetObj && snippetObj->nodesetval &&
         snippetObj->nodesetval->nodeNr > 0)
           ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
           : NULL;
 
-    if (!url || !title) {
-      LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i,
-                url ? url : "(null)", title ? title : "(null)");
-    }
-
     if (url && title) {
       (*out_results)[found_count].url = strdup(url);
       (*out_results)[found_count].title = strdup(title);
       (*out_results)[found_count].snippet =
           strdup(snippet_text ? snippet_text : "");
-      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
-                title);
+      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title);
       found_count++;
     } else {
       LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
@@ -318,9 +298,9 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
     if (linkObj) xmlXPathFreeObject(linkObj);
     if (titleObj) xmlXPathFreeObject(titleObj);
     if (snippetObj) xmlXPathFreeObject(snippetObj);
-    xmlXPathFreeContext(resCtx);
   }
 
+  xpathCtx->node = NULL;
   xmlXPathFreeObject(xpathObj);
   xmlXPathFreeContext(xpathCtx);
   return found_count;
@@ -345,124 +325,149 @@ const SearchEngine ENGINE_REGISTRY[] = {
 const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
 
-int scrape_engine(const SearchEngine *engine, const char *query,
-                  SearchResult **out_results, int max_results) {
-  CURL *curl;
-  MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0};
-  int results_count = 0;
+static void configure_curl_handle(CURL *curl, const char *full_url,
+                                  MemoryBuffer *chunk,
+                                  struct curl_slist *headers) {
+  curl_easy_setopt(curl, CURLOPT_URL, full_url);
+  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
+  curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
+
+  curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
 
-  LOG_INFO("--- Starting scrape for engine: %s ---", engine->name);
-  LOG_INFO("[%s] Query: '%s'", engine->name, query);
+  curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L);
+
+  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
+  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+  curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+}
 
-  if (!chunk.memory) {
-    LOG_ERROR("Initial memory allocation failed");
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
+  CURLM *multi_handle = curl_multi_init();
+  if (!multi_handle) {
+    LOG_ERROR("Failed to initialize curl_multi");
     return -1;
   }
 
-  curl = curl_easy_init();
+  for (int i = 0; i < num_jobs; i++) {
+    ScrapeJob *job = &jobs[i];
+    job->handle = curl_easy_init();
+    if (!job->handle) {
+      LOG_ERROR("[%s] Failed to init CURL handle", job->engine->name);
+      continue;
+    }
+
+    job->response.memory = (char *)malloc(16384);
+    job->response.size = 0;
+    job->response.capacity = 16384;
 
-  if (curl && query) {
     char full_url[1024];
-    char *encoded_query = curl_easy_escape(curl, query, 0);
+    char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
     if (!encoded_query) {
-      LOG_ERROR("[%s] Failed to encode query", engine->name);
-      curl_easy_cleanup(curl);
-      free(chunk.memory);
-      return -1;
+      LOG_ERROR("[%s] Failed to encode query", job->engine->name);
+      curl_easy_cleanup(job->handle);
+      job->handle = NULL;
+      continue;
     }
-    snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url,
-             encoded_query);
+    snprintf(full_url, sizeof(full_url), "%s%s", job->engine->base_url, encoded_query);
     curl_free(encoded_query);
 
-    LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url);
-
     struct curl_slist *headers = NULL;
     char host_buf[256], ref_buf[256];
-    snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header);
-    snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer);
-
+    snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header);
+    snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer);
     headers = curl_slist_append(headers, host_buf);
     headers = curl_slist_append(headers, ref_buf);
-    headers = curl_slist_append(headers,
-                                "Accept: "
-                                "text/html,application/xhtml+xml,application/"
-                                "xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
+    headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
     headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
     headers = curl_slist_append(headers, "DNT: 1");
-    headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1");
-    headers = curl_slist_append(headers, "Sec-Fetch-Dest: document");
-    headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate");
-    headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin");
-    headers = curl_slist_append(headers, "Connection: keep-alive");
 
-    curl_easy_setopt(curl, CURLOPT_URL, full_url);
-    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
-    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
-    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
+    configure_curl_handle(job->handle, full_url, &job->response, headers);
 
-    const char *ua = get_random_user_agent();
-    LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua);
-    curl_easy_setopt(curl, CURLOPT_USERAGENT, ua);
+    curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
 
-    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
-    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
-    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+    curl_multi_add_handle(multi_handle, job->handle);
+    LOG_INFO("[%s] Added to parallel queue", job->engine->name);
+  }
 
-    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+  usleep(100000 + (rand() % 100000));
 
-    LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name);
-    usleep(500000 + (rand() % 1000000));
+  int still_running = 0;
+  curl_multi_perform(multi_handle, &still_running);
 
-    CURLcode res = curl_easy_perform(curl);
+  do {
+    int numfds = 0;
+    CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
 
-    if (res != CURLE_OK) {
-      LOG_ERROR("[%s] libcurl error: %s", engine->name,
-                curl_easy_strerror(res));
-    } else {
-      long response_code;
-      curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
-      LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code);
-
-      if (chunk.size > 0) {
-        xmlDocPtr doc = htmlReadMemory(
-            chunk.memory, chunk.size, NULL, NULL,
-            HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
-        if (doc) {
-          results_count =
-              engine->parser(engine->name, doc, out_results, max_results);
-          xmlFreeDoc(doc);
-        }
-      }
+    if (mc != CURLM_OK) {
+      LOG_ERROR("curl_multi_wait() failed: %s", curl_multi_strerror(mc));
+      break;
     }
 
-    if (results_count <= 0) {
-      LOG_WARN("[%s] No results found. Generating skeleton fallback.",
-               engine->name);
-      *out_results = (SearchResult *)malloc(sizeof(SearchResult));
-      if (*out_results) {
-        char fallback_msg[512];
-        snprintf(fallback_msg, sizeof(fallback_msg),
-                 "Search %s manually for '%s'", engine->name, query);
-
-        (*out_results)[0].title = strdup(fallback_msg);
-        (*out_results)[0].url = strdup(full_url);
-        (*out_results)[0].snippet = strdup(
-            "Automated results were blocked by a Captcha or anti-bot "
-            "challenge. Click the link above to perform the search "
-            "manually in your browser.");
-        results_count = 1;
-      }
-    }
+    curl_multi_perform(multi_handle, &still_running);
+  } while (still_running);
+
+  CURLMsg *msg;
+  int msgs_left;
+  while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
+    if (msg->msg == CURLMSG_DONE) {
+      CURL *handle = msg->easy_handle;
+
+      for (int i = 0; i < num_jobs; i++) {
+        if (jobs[i].handle == handle) {
+          ScrapeJob *job = &jobs[i];
+
+          long response_code;
+          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
+          LOG_INFO("[%s] HTTP Response Code: %ld", job->engine->name, response_code);
+
+          if (msg->data.result == CURLE_OK && job->response.size > 0) {
+            xmlDocPtr doc = htmlReadMemory(
+                job->response.memory, job->response.size, NULL, NULL,
+                HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+
+            if (doc) {
+              job->results_count = job->engine->parser(
+                  job->engine->name, doc, job->out_results, job->max_results);
+              xmlFreeDoc(doc);
+            }
+          } else {
+            LOG_ERROR("[%s] Request failed: %s", job->engine->name,
+                      curl_easy_strerror(msg->data.result));
+            job->results_count = 0;
+          }
 
-    curl_slist_free_all(headers);
-    curl_easy_cleanup(curl);
-  } else {
-    if (curl) {
-      curl_easy_cleanup(curl);
+          struct curl_slist *headers;
+          curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
+          if (headers) curl_slist_free_all(headers);
+
+          free(job->response.memory);
+          curl_multi_remove_handle(multi_handle, handle);
+          curl_easy_cleanup(handle);
+          break;
+        }
+      }
     }
   }
-  free(chunk.memory);
-
-  return results_count;
+  curl_multi_cleanup(multi_handle);
+  return 0;
 }
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+                  SearchResult **out_results, int max_results) {
+  ScrapeJob job = {
+    .engine = engine,
+    .query = (char *)query,
+    .out_results = out_results,
+    .max_results = max_results,
+    .results_count = 0
+  };
+
+  scrape_engines_parallel(&job, 1);
+  return job.results_count;
+}
\ No newline at end of file
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index 7ad4d59..43e22d9 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -2,6 +2,7 @@
 #define SCRAPING_H
 
 #include
+#include <curl/curl.h>
 
 #define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
 #define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
@@ -25,10 +26,28 @@ typedef struct {
   ParserFunc parser;
 } SearchEngine;
 
+typedef struct {
+  char *memory;
+  size_t size;
+  size_t capacity;
+} MemoryBuffer;
+
+typedef struct {
+  const SearchEngine *engine;
+  char *query;
+  SearchResult **out_results;
+  int max_results;
+  CURL *handle;
+  MemoryBuffer response;
+  int results_count;
+} ScrapeJob;
+
 extern const SearchEngine ENGINE_REGISTRY[];
 extern const int ENGINE_COUNT;
 
 int scrape_engine(const SearchEngine *engine, const char *query,
                   SearchResult **out_results, int max_results);
 
-#endif
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);
+
+#endif
\ No newline at end of file
-- 
cgit v1.2.3
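
Usage sketch (not part of the commit): the new entry point takes an array of ScrapeJob, one per engine, and after the single curl_multi loop completes each job's out_results and results_count are filled in. The example below is a sketch under assumptions, not code from this repository: it assumes the header is reachable as "Scraping.h", that SearchResult exposes strdup()'d url/title/snippet strings which the caller frees (as the parsers above do), and uses a placeholder query string and result cap.

/* Sketch only: assumes SearchResult has char *url, *title, *snippet fields
 * that the parsers strdup() and the caller owns; query text is a placeholder. */
#include <stdio.h>
#include <stdlib.h>
#include <curl/curl.h>
#include "Scraping.h"

int main(void) {
  curl_global_init(CURL_GLOBAL_DEFAULT);

  ScrapeJob jobs[8] = {0};            /* one job (and one easy handle) per engine */
  SearchResult *results[8] = {0};
  int n = ENGINE_COUNT < 8 ? ENGINE_COUNT : 8;

  for (int i = 0; i < n; i++) {
    jobs[i].engine = &ENGINE_REGISTRY[i];
    jobs[i].query = (char *)"libcurl multi interface";  /* placeholder query */
    jobs[i].out_results = &results[i];
    jobs[i].max_results = 5;
  }

  /* All engines are fetched concurrently in one curl_multi event loop. */
  scrape_engines_parallel(jobs, n);

  for (int i = 0; i < n; i++) {
    printf("%s: %d result(s)\n", ENGINE_REGISTRY[i].name, jobs[i].results_count);
    for (int j = 0; j < jobs[i].results_count; j++) {
      printf("  %s\n    %s\n", results[i][j].title, results[i][j].url);
      free(results[i][j].url);          /* parsers strdup() these fields */
      free(results[i][j].title);
      free(results[i][j].snippet);
    }
    free(results[i]);
  }

  curl_global_cleanup();
  return 0;
}

Because the header list is stashed on each handle via CURLOPT_PRIVATE and freed inside scrape_engines_parallel along with the response buffer, the caller only owns the parsed result arrays.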