From b280ab6bcdf6c9bae46a7a21b7138d46d953dd71 Mon Sep 17 00:00:00 2001 From: frosty Date: Mon, 23 Feb 2026 00:57:21 -0500 Subject: oopsies --- src/Scraping/Scraping.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 src/Scraping/Scraping.h (limited to 'src/Scraping/Scraping.h') diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h new file mode 100644 index 0000000..d8a3b13 --- /dev/null +++ b/src/Scraping/Scraping.h @@ -0,0 +1,58 @@ +#ifndef SCRAPING_H +#define SCRAPING_H + +#include +#include + +#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__) +#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__) +#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__) +#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__) + +typedef struct { + char *url; + char *title; + char *snippet; +} SearchResult; + +typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results); + +typedef struct { + const char *name; + const char *base_url; + const char *host_header; + const char *referer; + + const char *page_param; + int page_multiplier; + int page_base; + ParserFunc parser; +} SearchEngine; + +typedef struct { + char *memory; + size_t size; + size_t capacity; +} MemoryBuffer; + +typedef struct { + const SearchEngine *engine; + char *query; + SearchResult **out_results; + int max_results; + int page; + CURL *handle; + MemoryBuffer response; + int results_count; +} ScrapeJob; + +extern const SearchEngine ENGINE_REGISTRY[]; +extern const int ENGINE_COUNT; + +int scrape_engine(const SearchEngine *engine, const char *query, + SearchResult **out_results, int max_results); + +int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs); + +#endif \ No newline at end of file -- cgit v1.2.3