From 47f16db1909d185f7a6c5987226f64f0e2788262 Mon Sep 17 00:00:00 2001 From: frosty Date: Thu, 22 Jan 2026 12:57:27 -0500 Subject: scraping now more efficient blehhh --- src/Routes/Search.c | 73 +++++++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 38 deletions(-) (limited to 'src/Routes') diff --git a/src/Routes/Search.c b/src/Routes/Search.c index 110e6f7..fcddfc2 100644 --- a/src/Routes/Search.c +++ b/src/Routes/Search.c @@ -11,19 +11,6 @@ #include #include -typedef struct { - const SearchEngine *engine; - const char *query; - SearchResult *results; - int count; -} EngineThreadData; - -static void *scrape_thread_func(void *arg) { - EngineThreadData *data = (EngineThreadData *)arg; - data->count = scrape_engine(data->engine, data->query, &data->results, 10); - return NULL; -} - typedef struct { const char *query; InfoBox result; @@ -109,7 +96,6 @@ int results_handler(UrlParams *params) { } char *encoded_query = strdup(raw_query); - char *display_query = url_decode_query(raw_query); LOG_INFO("Processing search request for query: '%s'", display_query); context_set(&ctx, "query", display_query); @@ -130,18 +116,20 @@ int results_handler(UrlParams *params) { pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data); pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data); - pthread_t engine_tids[ENGINE_COUNT]; - EngineThreadData engine_data[ENGINE_COUNT]; + ScrapeJob jobs[ENGINE_COUNT]; + SearchResult *all_results[ENGINE_COUNT]; for (int i = 0; i < ENGINE_COUNT; i++) { - engine_data[i].engine = &ENGINE_REGISTRY[i]; - engine_data[i].query = encoded_query; - - engine_data[i].results = NULL; - engine_data[i].count = 0; - pthread_create(&engine_tids[i], NULL, scrape_thread_func, &engine_data[i]); + all_results[i] = NULL; + jobs[i].engine = &ENGINE_REGISTRY[i]; + jobs[i].query = encoded_query; + jobs[i].out_results = &all_results[i]; + jobs[i].max_results = 10; + jobs[i].results_count = 0; } + scrape_engines_parallel(jobs, ENGINE_COUNT); + pthread_join(wiki_tid, NULL); pthread_join(calc_tid, NULL); @@ -150,12 +138,14 @@ int results_handler(UrlParams *params) { int infobox_count = 0; if (calc_data.success) { + LOG_INFO("Calculator result available, adding to InfoBox"); infobox_count = add_infobox_to_collection(&calc_data.result, &infobox_matrix, &infobox_inner_counts, infobox_count); } if (wiki_data.success) { + LOG_INFO("Wikipedia result available, adding to InfoBox"); infobox_count = add_infobox_to_collection(&wiki_data.result, &infobox_matrix, &infobox_inner_counts, infobox_count); @@ -172,8 +162,9 @@ int results_handler(UrlParams *params) { int total_results = 0; for (int i = 0; i < ENGINE_COUNT; i++) { - pthread_join(engine_tids[i], NULL); - total_results += engine_data[i].count; + total_results += jobs[i].results_count; + LOG_INFO("Engine %s returned %d results", + jobs[i].engine->name, jobs[i].results_count); } if (total_results > 0) { @@ -183,8 +174,8 @@ int results_handler(UrlParams *params) { int unique_count = 0; for (int i = 0; i < ENGINE_COUNT; i++) { - for (int j = 0; j < engine_data[i].count; j++) { - char *raw_url = engine_data[i].results[j].url; + for (int j = 0; j < jobs[i].results_count; j++) { + char *raw_url = all_results[i][j].url; char *clean_url = unescape_search_url(raw_url); char *display_url = clean_url ? clean_url : raw_url; @@ -198,9 +189,9 @@ int results_handler(UrlParams *params) { if (is_duplicate) { if (clean_url) free(clean_url); - free(engine_data[i].results[j].url); - free(engine_data[i].results[j].title); - free(engine_data[i].results[j].snippet); + free(all_results[i][j].url); + free(all_results[i][j].title); + free(all_results[i][j].snippet); continue; } @@ -211,27 +202,32 @@ int results_handler(UrlParams *params) { results_matrix[unique_count][0] = strdup(display_url); results_matrix[unique_count][1] = strdup(pretty_url); results_matrix[unique_count][2] = - engine_data[i].results[j].title - ? strdup(engine_data[i].results[j].title) + all_results[i][j].title + ? strdup(all_results[i][j].title) : strdup("Untitled"); results_matrix[unique_count][3] = - engine_data[i].results[j].snippet - ? strdup(engine_data[i].results[j].snippet) + all_results[i][j].snippet + ? strdup(all_results[i][j].snippet) : strdup(""); results_inner_counts[unique_count] = 4; free(pretty_url); - free(engine_data[i].results[j].url); - free(engine_data[i].results[j].title); - free(engine_data[i].results[j].snippet); + free(all_results[i][j].url); + free(all_results[i][j].title); + free(all_results[i][j].snippet); if (clean_url) free(clean_url); unique_count++; } - free(engine_data[i].results); + + if (all_results[i]) { + free(all_results[i]); + } } + LOG_INFO("Deduplicated to %d unique results", unique_count); + context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count, results_inner_counts); @@ -250,6 +246,7 @@ int results_handler(UrlParams *params) { free(results_matrix); free(results_inner_counts); } else { + LOG_WARN("No search results found for query: '%s'", display_query); char *html = render_template("results.html", &ctx); if (html) { send_response(html); @@ -270,4 +267,4 @@ int results_handler(UrlParams *params) { free_context(&ctx); return 0; -} +} \ No newline at end of file -- cgit v1.2.3