From 47f16db1909d185f7a6c5987226f64f0e2788262 Mon Sep 17 00:00:00 2001
From: frosty
Date: Thu, 22 Jan 2026 12:57:27 -0500
Subject: scraping now more efficient blehhh

---
 src/Routes/Search.c     |  73 ++++++------
 src/Scraping/Scraping.c | 297 ++++++++++++++++++++++++------------------------
 src/Scraping/Scraping.h |  21 +++-
 3 files changed, 206 insertions(+), 185 deletions(-)

(limited to 'src')

diff --git a/src/Routes/Search.c b/src/Routes/Search.c
index 110e6f7..fcddfc2 100644
--- a/src/Routes/Search.c
+++ b/src/Routes/Search.c
@@ -11,19 +11,6 @@
 #include
 #include
 
-typedef struct {
-  const SearchEngine *engine;
-  const char *query;
-  SearchResult *results;
-  int count;
-} EngineThreadData;
-
-static void *scrape_thread_func(void *arg) {
-  EngineThreadData *data = (EngineThreadData *)arg;
-  data->count = scrape_engine(data->engine, data->query, &data->results, 10);
-  return NULL;
-}
-
 typedef struct {
   const char *query;
   InfoBox result;
@@ -109,7 +96,6 @@ int results_handler(UrlParams *params) {
   }
 
   char *encoded_query = strdup(raw_query);
-
   char *display_query = url_decode_query(raw_query);
 
   LOG_INFO("Processing search request for query: '%s'", display_query);
   context_set(&ctx, "query", display_query);
@@ -130,18 +116,20 @@ int results_handler(UrlParams *params) {
   pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data);
   pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data);
 
-  pthread_t engine_tids[ENGINE_COUNT];
-  EngineThreadData engine_data[ENGINE_COUNT];
+  ScrapeJob jobs[ENGINE_COUNT];
+  SearchResult *all_results[ENGINE_COUNT];
 
   for (int i = 0; i < ENGINE_COUNT; i++) {
-    engine_data[i].engine = &ENGINE_REGISTRY[i];
-    engine_data[i].query = encoded_query;
-
-    engine_data[i].results = NULL;
-    engine_data[i].count = 0;
-    pthread_create(&engine_tids[i], NULL, scrape_thread_func, &engine_data[i]);
+    all_results[i] = NULL;
+    jobs[i].engine = &ENGINE_REGISTRY[i];
+    jobs[i].query = encoded_query;
+    jobs[i].out_results = &all_results[i];
+    jobs[i].max_results = 10;
+    jobs[i].results_count = 0;
   }
 
+  scrape_engines_parallel(jobs, ENGINE_COUNT);
+
   pthread_join(wiki_tid, NULL);
   pthread_join(calc_tid, NULL);
 
@@ -150,12 +138,14 @@ int results_handler(UrlParams *params) {
   int infobox_count = 0;
 
   if (calc_data.success) {
+    LOG_INFO("Calculator result available, adding to InfoBox");
     infobox_count = add_infobox_to_collection(&calc_data.result,
                                               &infobox_matrix,
                                               &infobox_inner_counts, infobox_count);
   }
 
   if (wiki_data.success) {
+    LOG_INFO("Wikipedia result available, adding to InfoBox");
     infobox_count = add_infobox_to_collection(&wiki_data.result,
                                               &infobox_matrix,
                                               &infobox_inner_counts, infobox_count);
@@ -172,8 +162,9 @@ int results_handler(UrlParams *params) {
   int total_results = 0;
 
   for (int i = 0; i < ENGINE_COUNT; i++) {
-    pthread_join(engine_tids[i], NULL);
-    total_results += engine_data[i].count;
+    total_results += jobs[i].results_count;
+    LOG_INFO("Engine %s returned %d results",
+             jobs[i].engine->name, jobs[i].results_count);
   }
 
   if (total_results > 0) {
@@ -183,8 +174,8 @@ int results_handler(UrlParams *params) {
     int unique_count = 0;
 
     for (int i = 0; i < ENGINE_COUNT; i++) {
-      for (int j = 0; j < engine_data[i].count; j++) {
-        char *raw_url = engine_data[i].results[j].url;
+      for (int j = 0; j < jobs[i].results_count; j++) {
+        char *raw_url = all_results[i][j].url;
         char *clean_url = unescape_search_url(raw_url);
         char *display_url = clean_url ?
             clean_url : raw_url;
@@ -198,9 +189,9 @@ int results_handler(UrlParams *params) {
 
         if (is_duplicate) {
           if (clean_url) free(clean_url);
-          free(engine_data[i].results[j].url);
-          free(engine_data[i].results[j].title);
-          free(engine_data[i].results[j].snippet);
+          free(all_results[i][j].url);
+          free(all_results[i][j].title);
+          free(all_results[i][j].snippet);
           continue;
         }
 
@@ -211,27 +202,32 @@ int results_handler(UrlParams *params) {
         results_matrix[unique_count][0] = strdup(display_url);
         results_matrix[unique_count][1] = strdup(pretty_url);
         results_matrix[unique_count][2] =
-            engine_data[i].results[j].title
-                ? strdup(engine_data[i].results[j].title)
+            all_results[i][j].title
+                ? strdup(all_results[i][j].title)
                 : strdup("Untitled");
         results_matrix[unique_count][3] =
-            engine_data[i].results[j].snippet
-                ? strdup(engine_data[i].results[j].snippet)
+            all_results[i][j].snippet
+                ? strdup(all_results[i][j].snippet)
                 : strdup("");
         results_inner_counts[unique_count] = 4;
 
         free(pretty_url);
-        free(engine_data[i].results[j].url);
-        free(engine_data[i].results[j].title);
-        free(engine_data[i].results[j].snippet);
+        free(all_results[i][j].url);
+        free(all_results[i][j].title);
+        free(all_results[i][j].snippet);
        if (clean_url) free(clean_url);
 
         unique_count++;
       }
-      free(engine_data[i].results);
+
+      if (all_results[i]) {
+        free(all_results[i]);
+      }
     }
 
+    LOG_INFO("Deduplicated to %d unique results", unique_count);
+
     context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count,
                                 results_inner_counts);
 
@@ -250,6 +246,7 @@ int results_handler(UrlParams *params) {
     free(results_matrix);
     free(results_inner_counts);
   } else {
+    LOG_WARN("No search results found for query: '%s'", display_query);
     char *html = render_template("results.html", &ctx);
     if (html) {
       send_response(html);
@@ -270,4 +267,4 @@ int results_handler(UrlParams *params) {
 
   free_context(&ctx);
   return 0;
-}
+}
\ No newline at end of file
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index d2afea6..c236b0f 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -8,23 +8,25 @@
 #include
 #include
 
-typedef struct {
-  char *memory;
-  size_t size;
-} MemoryBuffer;
-
 static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
                                   void *userp) {
   size_t realsize = size * nmemb;
   MemoryBuffer *mem = (MemoryBuffer *)userp;
 
-  char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
-  if (ptr == NULL) {
-    LOG_ERROR("Not enough memory (realloc returned NULL)");
-    return 0;
+  if (mem->size + realsize + 1 > mem->capacity) {
+
+    size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2;
+    while (new_cap < mem->size + realsize + 1) new_cap *= 2;
+
+    char *ptr = (char *)realloc(mem->memory, new_cap);
+    if (!ptr) {
+      LOG_ERROR("Not enough memory (realloc returned NULL)");
+      return 0;
+    }
+    mem->memory = ptr;
+    mem->capacity = new_cap;
   }
 
-  mem->memory = ptr;
   memcpy(&(mem->memory[mem->size]), contents, realsize);
   mem->size += realsize;
   mem->memory[mem->size] = 0;
@@ -70,8 +72,7 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
   }
 
   int num_links = xpathObj->nodesetval->nodeNr;
-  LOG_INFO("[%s] XPath matched %d potential result links", engine_name,
-           num_links);
+  LOG_INFO("[%s] XPath matched %d potential result links", engine_name, num_links);
 
   int actual_alloc = (num_links < max_results) ?
       num_links : max_results;
   *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
@@ -98,25 +99,22 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
            xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
       snippetRow = snippetRow->next;
     if (snippetRow) {
-      xmlXPathContextPtr subCtx = xmlXPathNewContext(doc);
-      if (subCtx) {
-        subCtx->node = snippetRow;
-        xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
-            (xmlChar *)".//td[@class='result-snippet']", subCtx);
-        if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
-          snippet_text =
-              (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
-        }
-        if (sObj) xmlXPathFreeObject(sObj);
-        xmlXPathFreeContext(subCtx);
+
+      xpathCtx->node = snippetRow;
+      xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
+          (xmlChar *)".//td[@class='result-snippet']", xpathCtx);
+      if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
+        snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
       }
+      if (sObj) xmlXPathFreeObject(sObj);
+      xpathCtx->node = NULL;
+
     }
 
     (*out_results)[found_count].url = strdup(url ? url : "");
     (*out_results)[found_count].title = strdup(title ? title : "No Title");
-    (*out_results)[found_count].snippet =
-        strdup(snippet_text ? snippet_text : "");
+    (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
 
     LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
               (*out_results)[found_count].title);
@@ -168,16 +166,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
   for (int i = 0; i < num_results && found_count < max_results; i++) {
     xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
 
-    xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
-    if (!resCtx) {
-      LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
-                i);
-      continue;
-    }
-    resCtx->node = resultNode;
+    xpathCtx->node = resultNode;
 
     xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
-        (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx);
+        (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx);
     char *url =
         (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
            : NULL;
 
     xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
-        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx);
+        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx);
     char *title =
         (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
            : NULL;
 
     xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
-        (xmlChar *)".//p[contains(@class, 'description')]", resCtx);
+        (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx);
     char *snippet_text =
         (snippetObj && snippetObj->nodesetval &&
          snippetObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
            : NULL;
 
       (*out_results)[found_count].url = strdup(url);
       (*out_results)[found_count].title = strdup(title);
       (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
-      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
-                title);
+      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title);
       found_count++;
     } else {
       LOG_WARN("[%s] Container %d missed URL or Title. "
               "URL: %s, Title: %s",
URL: %s, Title: %s", @@ -218,9 +209,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc, if (linkObj) xmlXPathFreeObject(linkObj); if (titleObj) xmlXPathFreeObject(titleObj); if (snippetObj) xmlXPathFreeObject(snippetObj); - xmlXPathFreeContext(resCtx); } + xpathCtx->node = NULL; + xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); return found_count; @@ -262,17 +254,11 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, for (int i = 0; i < num_results && found_count < max_results; i++) { xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; - xmlXPathContextPtr resCtx = xmlXPathNewContext(doc); - if (!resCtx) { - LOG_ERROR("[%s] Failed to create result context for item %d", engine_name, - i); - continue; - } - resCtx->node = resultNode; + xpathCtx->node = resultNode; xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']", - resCtx); + xpathCtx); char *url = (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], @@ -280,32 +266,26 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, : NULL; xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( - (xmlChar *)".//h3[contains(@class, 'title')]", resCtx); + (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx); char *title = (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) : NULL; xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( - (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx); + (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx); char *snippet_text = (snippetObj && snippetObj->nodesetval && snippetObj->nodesetval->nodeNr > 0) ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) : NULL; - if (!url || !title) { - LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i, - url ? url : "(null)", title ? title : "(null)"); - } - if (url && title) { (*out_results)[found_count].url = strdup(url); (*out_results)[found_count].title = strdup(title); (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : ""); - LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, - title); + LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title); found_count++; } else { LOG_WARN("[%s] Container %d missed URL or Title. 
URL: %s, Title: %s", @@ -318,9 +298,9 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, if (linkObj) xmlXPathFreeObject(linkObj); if (titleObj) xmlXPathFreeObject(titleObj); if (snippetObj) xmlXPathFreeObject(snippetObj); - xmlXPathFreeContext(resCtx); } + xpathCtx->node = NULL; xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); return found_count; @@ -345,124 +325,149 @@ const SearchEngine ENGINE_REGISTRY[] = { const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine); -int scrape_engine(const SearchEngine *engine, const char *query, - SearchResult **out_results, int max_results) { - CURL *curl; - MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0}; - int results_count = 0; +static void configure_curl_handle(CURL *curl, const char *full_url, + MemoryBuffer *chunk, + struct curl_slist *headers) { + curl_easy_setopt(curl, CURLOPT_URL, full_url); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk); + curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent()); + + curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, ""); - LOG_INFO("--- Starting scrape for engine: %s ---", engine->name); - LOG_INFO("[%s] Query: '%s'", engine->name, query); + curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L); + + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); + curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); +} - if (!chunk.memory) { - LOG_ERROR("Initial memory allocation failed"); +int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { + CURLM *multi_handle = curl_multi_init(); + if (!multi_handle) { + LOG_ERROR("Failed to initialize curl_multi"); return -1; } - curl = curl_easy_init(); + for (int i = 0; i < num_jobs; i++) { + ScrapeJob *job = &jobs[i]; + job->handle = curl_easy_init(); + if (!job->handle) { + LOG_ERROR("[%s] Failed to init CURL handle", job->engine->name); + continue; + } + + job->response.memory = (char *)malloc(16384); + job->response.size = 0; + job->response.capacity = 16384; - if (curl && query) { char full_url[1024]; - char *encoded_query = curl_easy_escape(curl, query, 0); + char *encoded_query = curl_easy_escape(job->handle, job->query, 0); if (!encoded_query) { - LOG_ERROR("[%s] Failed to encode query", engine->name); - curl_easy_cleanup(curl); - free(chunk.memory); - return -1; + LOG_ERROR("[%s] Failed to encode query", job->engine->name); + curl_easy_cleanup(job->handle); + job->handle = NULL; + continue; } - snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url, - encoded_query); + snprintf(full_url, sizeof(full_url), "%s%s", job->engine->base_url, encoded_query); curl_free(encoded_query); - LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url); - struct curl_slist *headers = NULL; char host_buf[256], ref_buf[256]; - snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header); - snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer); - + snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header); + snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer); headers = curl_slist_append(headers, host_buf); headers = curl_slist_append(headers, ref_buf); - headers = curl_slist_append(headers, - "Accept: " - 
"text/html,application/xhtml+xml,application/" - "xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); + headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); headers = curl_slist_append(headers, "DNT: 1"); - headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1"); - headers = curl_slist_append(headers, "Sec-Fetch-Dest: document"); - headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate"); - headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin"); - headers = curl_slist_append(headers, "Connection: keep-alive"); - curl_easy_setopt(curl, CURLOPT_URL, full_url); - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk); + configure_curl_handle(job->handle, full_url, &job->response, headers); - const char *ua = get_random_user_agent(); - LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua); - curl_easy_setopt(curl, CURLOPT_USERAGENT, ua); + curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L); - curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); + curl_multi_add_handle(multi_handle, job->handle); + LOG_INFO("[%s] Added to parallel queue", job->engine->name); + } - curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); + usleep(100000 + (rand() % 100000)); - LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name); - usleep(500000 + (rand() % 1000000)); + int still_running = 0; + curl_multi_perform(multi_handle, &still_running); - CURLcode res = curl_easy_perform(curl); + do { + int numfds = 0; + CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); - if (res != CURLE_OK) { - LOG_ERROR("[%s] libcurl error: %s", engine->name, - curl_easy_strerror(res)); - } else { - long response_code; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); - LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code); - - if (chunk.size > 0) { - xmlDocPtr doc = htmlReadMemory( - chunk.memory, chunk.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); - if (doc) { - results_count = - engine->parser(engine->name, doc, out_results, max_results); - xmlFreeDoc(doc); - } - } + if (mc != CURLM_OK) { + LOG_ERROR("curl_multi_wait() failed: %s", curl_multi_strerror(mc)); + break; } - if (results_count <= 0) { - LOG_WARN("[%s] No results found. Generating skeleton fallback.", - engine->name); - *out_results = (SearchResult *)malloc(sizeof(SearchResult)); - if (*out_results) { - char fallback_msg[512]; - snprintf(fallback_msg, sizeof(fallback_msg), - "Search %s manually for '%s'", engine->name, query); - - (*out_results)[0].title = strdup(fallback_msg); - (*out_results)[0].url = strdup(full_url); - (*out_results)[0].snippet = strdup( - "Automated results were blocked by a Captcha or anti-bot " - "challenge. 
-            "challenge. Click the link above to perform the search "
-            "manually in your browser.");
-        results_count = 1;
-      }
-    }
+    curl_multi_perform(multi_handle, &still_running);
+  } while (still_running);
+
+  CURLMsg *msg;
+  int msgs_left;
+  while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
+    if (msg->msg == CURLMSG_DONE) {
+      CURL *handle = msg->easy_handle;
+
+      for (int i = 0; i < num_jobs; i++) {
+        if (jobs[i].handle == handle) {
+          ScrapeJob *job = &jobs[i];
+
+          long response_code;
+          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
+          LOG_INFO("[%s] HTTP Response Code: %ld", job->engine->name, response_code);
+
+          if (msg->data.result == CURLE_OK && job->response.size > 0) {
+            xmlDocPtr doc = htmlReadMemory(
+                job->response.memory, job->response.size, NULL, NULL,
+                HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+
+            if (doc) {
+              job->results_count = job->engine->parser(
+                  job->engine->name, doc, job->out_results, job->max_results);
+              xmlFreeDoc(doc);
+            }
+          } else {
+            LOG_ERROR("[%s] Request failed: %s", job->engine->name,
+                      curl_easy_strerror(msg->data.result));
+            job->results_count = 0;
+          }
 
-    curl_slist_free_all(headers);
-    curl_easy_cleanup(curl);
-  } else {
-    if (curl) {
-      curl_easy_cleanup(curl);
+          struct curl_slist *headers;
+          curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
+          if (headers) curl_slist_free_all(headers);
+
+          free(job->response.memory);
+          curl_multi_remove_handle(multi_handle, handle);
+          curl_easy_cleanup(handle);
+          break;
+        }
+      }
    }
  }
 
-  free(chunk.memory);
-
-  return results_count;
+  curl_multi_cleanup(multi_handle);
+  return 0;
 }
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+                  SearchResult **out_results, int max_results) {
+  ScrapeJob job = {
+    .engine = engine,
+    .query = (char *)query,
+    .out_results = out_results,
+    .max_results = max_results,
+    .results_count = 0
+  };
+
+  scrape_engines_parallel(&job, 1);
+  return job.results_count;
+}
\ No newline at end of file
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index 7ad4d59..43e22d9 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -2,6 +2,7 @@
 #define SCRAPING_H
 
 #include
+#include
 
 #define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
 #define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
@@ -25,10 +26,28 @@ typedef struct {
   ParserFunc parser;
 } SearchEngine;
 
+typedef struct {
+  char *memory;
+  size_t size;
+  size_t capacity;
+} MemoryBuffer;
+
+typedef struct {
+  const SearchEngine *engine;
+  char *query;
+  SearchResult **out_results;
+  int max_results;
+  CURL *handle;
+  MemoryBuffer response;
+  int results_count;
+} ScrapeJob;
+
 extern const SearchEngine ENGINE_REGISTRY[];
 extern const int ENGINE_COUNT;
 
 int scrape_engine(const SearchEngine *engine, const char *query,
                   SearchResult **out_results, int max_results);
-#endif
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);
+
+#endif
\ No newline at end of file
-- 
cgit v1.2.3