-rw-r--r--  src/Routes/Search.c      73
-rw-r--r--  src/Scraping/Scraping.c  297
-rw-r--r--  src/Scraping/Scraping.h  21
3 files changed, 206 insertions(+), 185 deletions(-)
diff --git a/src/Routes/Search.c b/src/Routes/Search.c
index 110e6f7..fcddfc2 100644
--- a/src/Routes/Search.c
+++ b/src/Routes/Search.c
@@ -12,19 +12,6 @@
#include <time.h>
typedef struct {
- const SearchEngine *engine;
- const char *query;
- SearchResult *results;
- int count;
-} EngineThreadData;
-
-static void *scrape_thread_func(void *arg) {
- EngineThreadData *data = (EngineThreadData *)arg;
- data->count = scrape_engine(data->engine, data->query, &data->results, 10);
- return NULL;
-}
-
-typedef struct {
const char *query;
InfoBox result;
int success;
@@ -109,7 +96,6 @@ int results_handler(UrlParams *params) {
}
char *encoded_query = strdup(raw_query);
-
char *display_query = url_decode_query(raw_query);
LOG_INFO("Processing search request for query: '%s'", display_query);
context_set(&ctx, "query", display_query);
@@ -130,18 +116,20 @@ int results_handler(UrlParams *params) {
pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data);
pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data);
- pthread_t engine_tids[ENGINE_COUNT];
- EngineThreadData engine_data[ENGINE_COUNT];
+ ScrapeJob jobs[ENGINE_COUNT];
+ SearchResult *all_results[ENGINE_COUNT];
for (int i = 0; i < ENGINE_COUNT; i++) {
- engine_data[i].engine = &ENGINE_REGISTRY[i];
- engine_data[i].query = encoded_query;
-
- engine_data[i].results = NULL;
- engine_data[i].count = 0;
- pthread_create(&engine_tids[i], NULL, scrape_thread_func, &engine_data[i]);
+ all_results[i] = NULL;
+ jobs[i].engine = &ENGINE_REGISTRY[i];
+ jobs[i].query = encoded_query;
+ jobs[i].out_results = &all_results[i];
+ jobs[i].max_results = 10;
+ jobs[i].results_count = 0;
}
+ scrape_engines_parallel(jobs, ENGINE_COUNT);
+
pthread_join(wiki_tid, NULL);
pthread_join(calc_tid, NULL);
@@ -150,12 +138,14 @@ int results_handler(UrlParams *params) {
int infobox_count = 0;
if (calc_data.success) {
+ LOG_INFO("Calculator result available, adding to InfoBox");
infobox_count =
add_infobox_to_collection(&calc_data.result, &infobox_matrix,
&infobox_inner_counts, infobox_count);
}
if (wiki_data.success) {
+ LOG_INFO("Wikipedia result available, adding to InfoBox");
infobox_count =
add_infobox_to_collection(&wiki_data.result, &infobox_matrix,
&infobox_inner_counts, infobox_count);
@@ -172,8 +162,9 @@ int results_handler(UrlParams *params) {
int total_results = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
- pthread_join(engine_tids[i], NULL);
- total_results += engine_data[i].count;
+ total_results += jobs[i].results_count;
+ LOG_INFO("Engine %s returned %d results",
+ jobs[i].engine->name, jobs[i].results_count);
}
if (total_results > 0) {
@@ -183,8 +174,8 @@ int results_handler(UrlParams *params) {
int unique_count = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
- for (int j = 0; j < engine_data[i].count; j++) {
- char *raw_url = engine_data[i].results[j].url;
+ for (int j = 0; j < jobs[i].results_count; j++) {
+ char *raw_url = all_results[i][j].url;
char *clean_url = unescape_search_url(raw_url);
char *display_url = clean_url ? clean_url : raw_url;
@@ -198,9 +189,9 @@ int results_handler(UrlParams *params) {
if (is_duplicate) {
if (clean_url) free(clean_url);
- free(engine_data[i].results[j].url);
- free(engine_data[i].results[j].title);
- free(engine_data[i].results[j].snippet);
+ free(all_results[i][j].url);
+ free(all_results[i][j].title);
+ free(all_results[i][j].snippet);
continue;
}
@@ -211,27 +202,32 @@ int results_handler(UrlParams *params) {
results_matrix[unique_count][0] = strdup(display_url);
results_matrix[unique_count][1] = strdup(pretty_url);
results_matrix[unique_count][2] =
- engine_data[i].results[j].title
- ? strdup(engine_data[i].results[j].title)
+ all_results[i][j].title
+ ? strdup(all_results[i][j].title)
: strdup("Untitled");
results_matrix[unique_count][3] =
- engine_data[i].results[j].snippet
- ? strdup(engine_data[i].results[j].snippet)
+ all_results[i][j].snippet
+ ? strdup(all_results[i][j].snippet)
: strdup("");
results_inner_counts[unique_count] = 4;
free(pretty_url);
- free(engine_data[i].results[j].url);
- free(engine_data[i].results[j].title);
- free(engine_data[i].results[j].snippet);
+ free(all_results[i][j].url);
+ free(all_results[i][j].title);
+ free(all_results[i][j].snippet);
if (clean_url) free(clean_url);
unique_count++;
}
- free(engine_data[i].results);
+
+ if (all_results[i]) {
+ free(all_results[i]);
+ }
}
+ LOG_INFO("Deduplicated to %d unique results", unique_count);
+
context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count,
results_inner_counts);
@@ -250,6 +246,7 @@ int results_handler(UrlParams *params) {
free(results_matrix);
free(results_inner_counts);
} else {
+ LOG_WARN("No search results found for query: '%s'", display_query);
char *html = render_template("results.html", &ctx);
if (html) {
send_response(html);
@@ -270,4 +267,4 @@ int results_handler(UrlParams *params) {
free_context(&ctx);
return 0;
-}
+}
\ No newline at end of file
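
For reference, a minimal sketch (not part of the change itself) of the caller pattern the Search.c hunks above now rely on, written against the ScrapeJob fields declared in Scraping.h further down. The helper name run_all_engines, the include path, and the printf reporting are illustrative only.

/* Sketch only: drive every registered engine through one blocking call
 * and walk the per-engine result arrays afterwards. */
#include <stdio.h>
#include <stdlib.h>
#include "Scraping.h"   /* adjust the path to match the project layout */

static void run_all_engines(const char *encoded_query) {
  ScrapeJob jobs[ENGINE_COUNT];
  SearchResult *all_results[ENGINE_COUNT];

  for (int i = 0; i < ENGINE_COUNT; i++) {
    all_results[i] = NULL;
    jobs[i].engine = &ENGINE_REGISTRY[i];
    jobs[i].query = (char *)encoded_query;
    jobs[i].out_results = &all_results[i];
    jobs[i].max_results = 10;
    jobs[i].results_count = 0;
  }

  scrape_engines_parallel(jobs, ENGINE_COUNT);  /* all engines fetched concurrently */

  for (int i = 0; i < ENGINE_COUNT; i++) {
    for (int j = 0; j < jobs[i].results_count; j++) {
      printf("[%s] %s\n", jobs[i].engine->name, all_results[i][j].title);
      free(all_results[i][j].url);
      free(all_results[i][j].title);
      free(all_results[i][j].snippet);
    }
    free(all_results[i]);
  }
}
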
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index d2afea6..c236b0f 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -8,23 +8,25 @@
#include <time.h>
#include <unistd.h>
-typedef struct {
- char *memory;
- size_t size;
-} MemoryBuffer;
-
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
void *userp) {
size_t realsize = size * nmemb;
MemoryBuffer *mem = (MemoryBuffer *)userp;
- char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
- if (ptr == NULL) {
- LOG_ERROR("Not enough memory (realloc returned NULL)");
- return 0;
+ if (mem->size + realsize + 1 > mem->capacity) {
+
+ size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2;
+ while (new_cap < mem->size + realsize + 1) new_cap *= 2;
+
+ char *ptr = (char *)realloc(mem->memory, new_cap);
+ if (!ptr) {
+ LOG_ERROR("Not enough memory (realloc returned NULL)");
+ return 0;
+ }
+ mem->memory = ptr;
+ mem->capacity = new_cap;
}
- mem->memory = ptr;
memcpy(&(mem->memory[mem->size]), contents, realsize);
mem->size += realsize;
mem->memory[mem->size] = 0;
@@ -70,8 +72,7 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
}
int num_links = xpathObj->nodesetval->nodeNr;
- LOG_INFO("[%s] XPath matched %d potential result links", engine_name,
- num_links);
+ LOG_INFO("[%s] XPath matched %d potential result links", engine_name, num_links);
int actual_alloc = (num_links < max_results) ? num_links : max_results;
*out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
@@ -98,25 +99,22 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
snippetRow = snippetRow->next;
if (snippetRow) {
- xmlXPathContextPtr subCtx = xmlXPathNewContext(doc);
- if (subCtx) {
- subCtx->node = snippetRow;
- xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
- (xmlChar *)".//td[@class='result-snippet']", subCtx);
- if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
- snippet_text =
- (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
- }
- if (sObj) xmlXPathFreeObject(sObj);
- xmlXPathFreeContext(subCtx);
+
+ xpathCtx->node = snippetRow;
+ xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
+ (xmlChar *)".//td[@class='result-snippet']", xpathCtx);
+ if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
+ snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
}
+ if (sObj) xmlXPathFreeObject(sObj);
+ xpathCtx->node = NULL;
+
}
}
(*out_results)[found_count].url = strdup(url ? url : "");
(*out_results)[found_count].title = strdup(title ? title : "No Title");
- (*out_results)[found_count].snippet =
- strdup(snippet_text ? snippet_text : "");
+ (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
(*out_results)[found_count].title);
@@ -168,16 +166,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
for (int i = 0; i < num_results && found_count < max_results; i++) {
xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
- xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
- if (!resCtx) {
- LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
- i);
- continue;
- }
- resCtx->node = resultNode;
+ xpathCtx->node = resultNode;
xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
- (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx);
+ (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx);
char *url =
(linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
@@ -185,14 +177,14 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
: NULL;
xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
- (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx);
+ (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx);
char *title =
(titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
: NULL;
xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
- (xmlChar *)".//p[contains(@class, 'description')]", resCtx);
+ (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx);
char *snippet_text =
(snippetObj && snippetObj->nodesetval &&
snippetObj->nodesetval->nodeNr > 0)
@@ -204,8 +196,7 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
(*out_results)[found_count].title = strdup(title);
(*out_results)[found_count].snippet =
strdup(snippet_text ? snippet_text : "");
- LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
- title);
+ LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title);
found_count++;
} else {
LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
@@ -218,9 +209,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
if (linkObj) xmlXPathFreeObject(linkObj);
if (titleObj) xmlXPathFreeObject(titleObj);
if (snippetObj) xmlXPathFreeObject(snippetObj);
- xmlXPathFreeContext(resCtx);
}
+ xpathCtx->node = NULL;
+
xmlXPathFreeObject(xpathObj);
xmlXPathFreeContext(xpathCtx);
return found_count;
@@ -262,17 +254,11 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
for (int i = 0; i < num_results && found_count < max_results; i++) {
xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
- xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
- if (!resCtx) {
- LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
- i);
- continue;
- }
- resCtx->node = resultNode;
+ xpathCtx->node = resultNode;
xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
(xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
- resCtx);
+ xpathCtx);
char *url =
(linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
@@ -280,32 +266,26 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
: NULL;
xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
- (xmlChar *)".//h3[contains(@class, 'title')]", resCtx);
+ (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx);
char *title =
(titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
: NULL;
xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
- (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx);
+ (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx);
char *snippet_text =
(snippetObj && snippetObj->nodesetval &&
snippetObj->nodesetval->nodeNr > 0)
? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
: NULL;
- if (!url || !title) {
- LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i,
- url ? url : "(null)", title ? title : "(null)");
- }
-
if (url && title) {
(*out_results)[found_count].url = strdup(url);
(*out_results)[found_count].title = strdup(title);
(*out_results)[found_count].snippet =
strdup(snippet_text ? snippet_text : "");
- LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
- title);
+ LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title);
found_count++;
} else {
LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
@@ -318,9 +298,9 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
if (linkObj) xmlXPathFreeObject(linkObj);
if (titleObj) xmlXPathFreeObject(titleObj);
if (snippetObj) xmlXPathFreeObject(snippetObj);
- xmlXPathFreeContext(resCtx);
}
+ xpathCtx->node = NULL;
xmlXPathFreeObject(xpathObj);
xmlXPathFreeContext(xpathCtx);
return found_count;
@@ -345,124 +325,149 @@ const SearchEngine ENGINE_REGISTRY[] = {
const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
-int scrape_engine(const SearchEngine *engine, const char *query,
- SearchResult **out_results, int max_results) {
- CURL *curl;
- MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0};
- int results_count = 0;
+static void configure_curl_handle(CURL *curl, const char *full_url,
+ MemoryBuffer *chunk,
+ struct curl_slist *headers) {
+ curl_easy_setopt(curl, CURLOPT_URL, full_url);
+ curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+ curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
+ curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
+
+ curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+ curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
- LOG_INFO("--- Starting scrape for engine: %s ---", engine->name);
- LOG_INFO("[%s] Query: '%s'", engine->name, query);
+ curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L);
+
+ curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
+ curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+ curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+}
- if (!chunk.memory) {
- LOG_ERROR("Initial memory allocation failed");
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
+ CURLM *multi_handle = curl_multi_init();
+ if (!multi_handle) {
+ LOG_ERROR("Failed to initialize curl_multi");
return -1;
}
- curl = curl_easy_init();
+ for (int i = 0; i < num_jobs; i++) {
+ ScrapeJob *job = &jobs[i];
+ job->handle = curl_easy_init();
+ if (!job->handle) {
+ LOG_ERROR("[%s] Failed to init CURL handle", job->engine->name);
+ continue;
+ }
+
+ job->response.memory = (char *)malloc(16384);
+ job->response.size = 0;
+ job->response.capacity = 16384;
- if (curl && query) {
char full_url[1024];
- char *encoded_query = curl_easy_escape(curl, query, 0);
+ char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
if (!encoded_query) {
- LOG_ERROR("[%s] Failed to encode query", engine->name);
- curl_easy_cleanup(curl);
- free(chunk.memory);
- return -1;
+ LOG_ERROR("[%s] Failed to encode query", job->engine->name);
+ curl_easy_cleanup(job->handle);
+ job->handle = NULL;
+ continue;
}
- snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url,
- encoded_query);
+ snprintf(full_url, sizeof(full_url), "%s%s", job->engine->base_url, encoded_query);
curl_free(encoded_query);
- LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url);
-
struct curl_slist *headers = NULL;
char host_buf[256], ref_buf[256];
- snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header);
- snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer);
-
+ snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header);
+ snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer);
headers = curl_slist_append(headers, host_buf);
headers = curl_slist_append(headers, ref_buf);
- headers = curl_slist_append(headers,
- "Accept: "
- "text/html,application/xhtml+xml,application/"
- "xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
+ headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
headers = curl_slist_append(headers, "DNT: 1");
- headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1");
- headers = curl_slist_append(headers, "Sec-Fetch-Dest: document");
- headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate");
- headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin");
- headers = curl_slist_append(headers, "Connection: keep-alive");
- curl_easy_setopt(curl, CURLOPT_URL, full_url);
- curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
- curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
- curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
+ configure_curl_handle(job->handle, full_url, &job->response, headers);
- const char *ua = get_random_user_agent();
- LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua);
- curl_easy_setopt(curl, CURLOPT_USERAGENT, ua);
+ curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
- curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
- curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
- curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+ curl_multi_add_handle(multi_handle, job->handle);
+ LOG_INFO("[%s] Added to parallel queue", job->engine->name);
+ }
- curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+ usleep(100000 + (rand() % 100000));
- LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name);
- usleep(500000 + (rand() % 1000000));
+ int still_running = 0;
+ curl_multi_perform(multi_handle, &still_running);
- CURLcode res = curl_easy_perform(curl);
+ do {
+ int numfds = 0;
+ CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
- if (res != CURLE_OK) {
- LOG_ERROR("[%s] libcurl error: %s", engine->name,
- curl_easy_strerror(res));
- } else {
- long response_code;
- curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
- LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code);
-
- if (chunk.size > 0) {
- xmlDocPtr doc = htmlReadMemory(
- chunk.memory, chunk.size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
- if (doc) {
- results_count =
- engine->parser(engine->name, doc, out_results, max_results);
- xmlFreeDoc(doc);
- }
- }
+ if (mc != CURLM_OK) {
+ LOG_ERROR("curl_multi_wait() failed: %s", curl_multi_strerror(mc));
+ break;
}
- if (results_count <= 0) {
- LOG_WARN("[%s] No results found. Generating skeleton fallback.",
- engine->name);
- *out_results = (SearchResult *)malloc(sizeof(SearchResult));
- if (*out_results) {
- char fallback_msg[512];
- snprintf(fallback_msg, sizeof(fallback_msg),
- "Search %s manually for '%s'", engine->name, query);
-
- (*out_results)[0].title = strdup(fallback_msg);
- (*out_results)[0].url = strdup(full_url);
- (*out_results)[0].snippet = strdup(
- "Automated results were blocked by a Captcha or anti-bot "
- "challenge. Click the link above to perform the search "
- "manually in your browser.");
- results_count = 1;
- }
- }
+ curl_multi_perform(multi_handle, &still_running);
+ } while (still_running);
+
+ CURLMsg *msg;
+ int msgs_left;
+ while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
+ if (msg->msg == CURLMSG_DONE) {
+ CURL *handle = msg->easy_handle;
+
+ for (int i = 0; i < num_jobs; i++) {
+ if (jobs[i].handle == handle) {
+ ScrapeJob *job = &jobs[i];
+
+ long response_code;
+ curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
+ LOG_INFO("[%s] HTTP Response Code: %ld", job->engine->name, response_code);
+
+ if (msg->data.result == CURLE_OK && job->response.size > 0) {
+ xmlDocPtr doc = htmlReadMemory(
+ job->response.memory, job->response.size, NULL, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+
+ if (doc) {
+ job->results_count = job->engine->parser(
+ job->engine->name, doc, job->out_results, job->max_results);
+ xmlFreeDoc(doc);
+ }
+ } else {
+ LOG_ERROR("[%s] Request failed: %s", job->engine->name,
+ curl_easy_strerror(msg->data.result));
+ job->results_count = 0;
+ }
- curl_slist_free_all(headers);
- curl_easy_cleanup(curl);
- } else {
- if (curl) {
- curl_easy_cleanup(curl);
+ struct curl_slist *headers;
+ curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
+ if (headers) curl_slist_free_all(headers);
+
+ free(job->response.memory);
+ curl_multi_remove_handle(multi_handle, handle);
+ curl_easy_cleanup(handle);
+ break;
+ }
+ }
}
}
- free(chunk.memory);
-
- return results_count;
+ curl_multi_cleanup(multi_handle);
+ return 0;
}
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+ SearchResult **out_results, int max_results) {
+ ScrapeJob job = {
+ .engine = engine,
+ .query = (char *)query,
+ .out_results = out_results,
+ .max_results = max_results,
+ .results_count = 0
+ };
+
+ scrape_engines_parallel(&job, 1);
+ return job.results_count;
+}
\ No newline at end of file
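
One detail worth showing in isolation: the new multi-handle path stashes each handle's header list with CURLOPT_PRIVATE so it can be freed once the transfer completes. A minimal sketch of that round-trip, assuming nothing beyond libcurl itself (CURLINFO_PRIVATE is documented to return the pointer through a char *, hence the cast back):

#include <curl/curl.h>

static void attach_headers(CURL *handle, struct curl_slist *headers) {
  /* associate the per-handle header list with the easy handle */
  curl_easy_setopt(handle, CURLOPT_PRIVATE, headers);
}

static void release_headers(CURL *handle) {
  char *priv = NULL;
  if (curl_easy_getinfo(handle, CURLINFO_PRIVATE, &priv) == CURLE_OK && priv)
    curl_slist_free_all((struct curl_slist *)priv);
}
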
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index 7ad4d59..43e22d9 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -2,6 +2,7 @@
#define SCRAPING_H
#include <libxml/HTMLparser.h>
+#include <curl/curl.h>
#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
@@ -25,10 +26,28 @@ typedef struct {
ParserFunc parser;
} SearchEngine;
+typedef struct {
+ char *memory;
+ size_t size;
+ size_t capacity;
+} MemoryBuffer;
+
+typedef struct {
+ const SearchEngine *engine;
+ char *query;
+ SearchResult **out_results;
+ int max_results;
+ CURL *handle;
+ MemoryBuffer response;
+ int results_count;
+} ScrapeJob;
+
extern const SearchEngine ENGINE_REGISTRY[];
extern const int ENGINE_COUNT;
int scrape_engine(const SearchEngine *engine, const char *query,
SearchResult **out_results, int max_results);
-#endif
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);
+
+#endif
\ No newline at end of file
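
For orientation, the generic libcurl multi-interface cycle behind the new scrape_engines_parallel() prototype above: a sketch of the perform/wait/info_read pattern only, with the per-job bookkeeping and error handling from Scraping.c omitted.

#include <curl/curl.h>

static void drive_transfers(CURLM *multi) {
  int still_running = 0;
  curl_multi_perform(multi, &still_running);

  while (still_running) {
    int numfds = 0;
    if (curl_multi_wait(multi, NULL, 0, 1000, &numfds) != CURLM_OK)
      break;                                   /* stop on a poll failure */
    curl_multi_perform(multi, &still_running);
  }

  CURLMsg *msg;
  int msgs_left = 0;
  while ((msg = curl_multi_info_read(multi, &msgs_left))) {
    if (msg->msg == CURLMSG_DONE) {
      /* msg->easy_handle identifies the finished transfer;
       * msg->data.result carries its CURLcode */
    }
  }
}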