diff options
| author | frosty <gabriel@bwaaa.monster> | 2026-02-27 18:32:23 -0500 |
|---|---|---|
| committer | frosty <gabriel@bwaaa.monster> | 2026-02-27 18:32:23 -0500 |
| commit | 9f2cd561286784fd000eb8a00f1f80db3185062c (patch) | |
| tree | 14216b6d50b34bab1c7f7ae70d628d3560613f9e /src/Scraping | |
| parent | 26e3403e039d1a80f2e62f8efe889ad5f40c8cee (diff) | |
| download | omnisearch-9f2cd561286784fd000eb8a00f1f80db3185062c.tar.gz | |
added proxying
Diffstat (limited to 'src/Scraping')
| -rw-r--r-- | src/Scraping/Scraping.c | 36 | ||||
| -rw-r--r-- | src/Scraping/Scraping.h | 7 |
2 files changed, 35 insertions, 8 deletions
```diff
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index 42e05d6..5b1b5d6 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -1,4 +1,5 @@
 #include "Scraping.h"
+#include "../Proxy/Proxy.h"
 #include "../Utility/Unescape.h"
 #include <curl/curl.h>
 #include <libxml/HTMLparser.h>
@@ -329,9 +330,14 @@ static void configure_curl_handle(CURL *curl, const char *full_url,
     curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
     curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
     curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+
+    apply_proxy_settings(curl);
 }
 
 int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
+    int retries = 0;
+
+retry:
     CURLM *multi_handle = curl_multi_init();
     if (!multi_handle) {
         return -1;
     }
@@ -339,6 +345,15 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
 
     for (int i = 0; i < num_jobs; i++) {
         ScrapeJob *job = &jobs[i];
+
+        if (job->handle) {
+            curl_easy_cleanup(job->handle);
+            job->handle = NULL;
+        }
+        if (job->response.memory) {
+            free(job->response.memory);
+        }
+
         job->handle = curl_easy_init();
         if (!job->handle) {
             continue;
         }
@@ -406,7 +421,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
         CURL *handle = msg->easy_handle;
 
         for (int i = 0; i < num_jobs; i++) {
-            if (jobs[i].handle == handle) {
+            if (jobs[i].handle && jobs[i].handle == handle) {
                 ScrapeJob *job = &jobs[i];
 
                 long response_code;
@@ -431,8 +446,10 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
                 if (headers) curl_slist_free_all(headers);
                 free(job->response.memory);
+                job->response.memory = NULL;
 
                 curl_multi_remove_handle(multi_handle, handle);
-                curl_easy_cleanup(handle);
+                if (handle) curl_easy_cleanup(handle);
+                job->handle = NULL;
                 break;
             }
         }
@@ -440,6 +457,21 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
     }
 
     curl_multi_cleanup(multi_handle);
+
+    if (retries < max_proxy_retries && proxy_count > 0) {
+        int any_failed = 0;
+        for (int i = 0; i < num_jobs; i++) {
+            if (jobs[i].results_count == 0 && jobs[i].response.size == 0) {
+                any_failed = 1;
+                break;
+            }
+        }
+        if (any_failed) {
+            retries++;
+            goto retry;
+        }
+    }
+
     return 0;
 }
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index d8a3b13..0865825 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -4,11 +4,6 @@
 #include <libxml/HTMLparser.h>
 #include <curl/curl.h>
 
-#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
-#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
-#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__)
-#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__)
-
 typedef struct {
     char *url;
     char *title;
@@ -55,4 +50,4 @@ int scrape_engine(const SearchEngine *engine, const char *query,
 int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);
 
-#endif
\ No newline at end of file
+#endif
```
