/*
 * path: root/src/Scraping/Scraping.h
 * blob: be65e5a0c97a1e9fce2f67aa1a1a600f8659ff86 (plain)
 */
#ifndef SCRAPING_H
#define SCRAPING_H

#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>

/*
 * One search hit, described by three strings.
 *
 * NOTE(review): the members are non-const char *, which suggests the strings
 * are owned by the result and must be freed with it -- confirm against the
 * code that fills and releases these.
 */
typedef struct {
  char *url;     /* URL string of the hit */
  char *title;   /* title string of the hit */
  char *snippet; /* snippet string; presumably a short excerpt -- TODO confirm */
} SearchResult;

/*
 * Function-pointer type for a result parser; SearchEngine.parser holds one.
 *
 *   engine_name  name string passed to the parser
 *   doc          libxml2 document handle the parser receives
 *   out_results  address of a SearchResult pointer supplied by the caller
 *   max_results  integer limit supplied by the caller
 *
 * NOTE(review): the meaning of the int return value is not visible in this
 * header -- presumably the number of results written; confirm against the
 * parser implementations.
 */
typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc,
                          SearchResult **out_results, int max_results);

/*
 * Static description of one search engine: identification strings, request
 * settings, paging settings, the parser to use, and an enable flag.
 *
 * host_header/referer share their names with the parameters of
 * build_request_headers(); base_url, page_param, page_multiplier and
 * page_base share theirs with the parameters of build_search_url().
 */
typedef struct {
  const char *id;          /* identifier string for the engine */
  const char *name;        /* name string for the engine */
  const char *base_url;    /* base URL string */
  const char *host_header; /* presumably the Host header value -- confirm */
  const char *referer;     /* presumably the Referer header value -- confirm */

  /* Paging settings. NOTE(review): how these combine into a URL is defined
   * by build_search_url(), whose body is not in this header. */
  const char *page_param;
  int page_multiplier;
  int page_base;
  ParserFunc parser; /* parser callback for this engine */
  int enabled;       /* presumably non-zero when the engine is active --
                        TODO confirm */
} SearchEngine;

/*
 * Byte buffer with a size and a capacity field. Used as ScrapeJob.response
 * and passed by pointer to configure_curl_handle().
 *
 * NOTE(review): presumably `size` is the number of bytes in use and
 * `capacity` the number of bytes allocated for `memory` -- confirm against
 * write_memory_callback().
 */
typedef struct {
  char *memory;
  size_t size;
  size_t capacity;
} MemoryBuffer;

/*
 * Status values for a ScrapeJob (see ScrapeJob.status).
 *
 * SCRAPE_STATUS_PENDING is the first enumerator and therefore has value 0,
 * so a zero-initialised ScrapeJob has status PENDING.
 *
 * NOTE(review): the per-value descriptions below are inferred from the
 * enumerator names only -- confirm against the code that sets them.
 */
typedef enum {
  SCRAPE_STATUS_PENDING,        /* presumably: not finished yet */
  SCRAPE_STATUS_OK,             /* presumably: results were obtained */
  SCRAPE_STATUS_EMPTY,          /* presumably: no results found */
  SCRAPE_STATUS_FETCH_ERROR,    /* presumably: the HTTP fetch failed */
  SCRAPE_STATUS_PARSE_MISMATCH, /* presumably: page did not match the parser */
  SCRAPE_STATUS_BLOCKED,        /* presumably: the engine refused the request */
} ScrapeStatus;

/*
 * State for one scrape request; scrape_engines_parallel() takes a pointer to
 * these plus a count.
 *
 * NOTE(review): which fields the caller fills in and which are written by
 * the scraper is not visible in this header; the input/output grouping below
 * is a guess from the field names -- confirm against the implementation.
 */
typedef struct {
  /* Presumably inputs. */
  const SearchEngine *engine; /* engine description for this job */
  char *query;                /* query string (non-const; ownership unclear) */
  SearchResult **out_results; /* address of a SearchResult pointer */
  int max_results;
  int page;

  /* Presumably working state and outputs. */
  CURL *handle;          /* libcurl easy handle for this job */
  MemoryBuffer response; /* buffer embedded by value in the job */
  int results_count;
  long http_status;      /* presumably the HTTP response code -- confirm */
  ScrapeStatus status;
} ScrapeJob;

/*
 * Engine table and its element count, both defined in another translation
 * unit. The array is declared non-const, so its entries may be modified at
 * run time; the count is const.
 */
extern SearchEngine ENGINE_REGISTRY[];
extern const int ENGINE_COUNT;
/*
 * Apply an engines configuration string.
 * NOTE(review): the format of engines_str and what it changes are not
 * visible here -- presumably it sets SearchEngine.enabled in ENGINE_REGISTRY;
 * confirm against the definition.
 */
void apply_engines_config(const char *engines_str);

/*
 * Data callback with the (contents, size, nmemb, userp) -> size_t shape that
 * libcurl uses for CURLOPT_WRITEFUNCTION.
 * NOTE(review): presumably userp points to a MemoryBuffer that the received
 * bytes are appended to -- confirm against the definition.
 */
size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
                             void *userp);
/*
 * Return a User-Agent string. The result is const char *, so callers must
 * not modify it.
 * NOTE(review): whether the caller may free or must copy it is not visible
 * here -- confirm.
 */
const char *get_random_user_agent(void);
/*
 * Configure a libcurl easy handle for a request.
 *
 *   curl      handle to configure
 *   full_url  URL string for the request
 *   chunk     buffer associated with the request; presumably the response
 *             destination -- TODO confirm
 *   headers   libcurl header list for the request
 *
 * NOTE(review): exactly which options are set is defined in the .c file.
 */
void configure_curl_handle(CURL *curl, const char *full_url,
                           MemoryBuffer *chunk, struct curl_slist *headers);
/*
 * Build a search URL string from a base URL, paging settings, an encoded
 * query and a page number. The first four parameters share their names with
 * fields of SearchEngine.
 *
 * Returns a non-const char *.
 * NOTE(review): presumably the result is heap-allocated and must be freed by
 * the caller, and NULL signals failure -- confirm against the definition.
 * The formula combining page, page_multiplier and page_base is likewise not
 * visible here.
 */
char *build_search_url(const char *base_url, const char *page_param,
                       int page_multiplier, int page_base,
                       const char *encoded_query, int page);
/*
 * Build a libcurl header list from a host header and a referer string.
 * NOTE(review): presumably the caller releases the returned list with
 * curl_slist_free_all(); handling of NULL arguments is not visible here --
 * confirm.
 */
struct curl_slist *build_request_headers(const char *host_header,
                                         const char *referer);
/*
 * Takes no arguments and returns nothing.
 * NOTE(review): presumably pauses between HTTP requests; the duration is
 * defined in the .c file -- confirm.
 */
void http_delay(void);

/*
 * Create a libxml2 XPath context for the given document.
 * NOTE(review): presumably returns NULL on failure -- confirm.
 */
xmlXPathContextPtr create_xpath_context(xmlDocPtr doc);
/*
 * Release an XPath context together with an XPath result object.
 * NOTE(review): whether NULL is accepted for either argument is not visible
 * here -- confirm.
 */
void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj);
/*
 * Allocate an array of SearchResult.
 * NOTE(review): how capacity and max_results together determine the array
 * length (presumably the smaller of the two), and whether NULL is returned
 * on failure, is not visible here -- confirm against the definition.
 */
SearchResult *alloc_results_array(int capacity, int max_results);
/*
 * Fill one SearchResult from url, title and snippet strings.
 * NOTE(review): whether the strings are copied or their pointers stored, and
 * what kind of unescaping the `unescape` flag enables, is not visible here --
 * confirm.
 */
void assign_result(SearchResult *result, char *url, char *title, char *snippet,
                   int unescape);
/*
 * Release three strings: title, url and snippet.
 * Note the argument order (title, url, snippet) differs from assign_result()
 * (url, title, snippet).
 * NOTE(review): presumably these are libxml2-allocated strings released with
 * xmlFree() -- confirm.
 */
void free_xml_node_list(char *title, char *url, char *snippet);

/*
 * Run one engine for one query.
 *
 *   engine       engine description to use
 *   query        query string
 *   out_results  address of a SearchResult pointer supplied by the caller
 *   max_results  integer limit supplied by the caller
 *
 * NOTE(review): presumably returns the number of results stored and the
 * caller frees *out_results; the error return convention is not visible
 * here -- confirm.
 */
int scrape_engine(const SearchEngine *engine, const char *query,
                  SearchResult **out_results, int max_results);

/*
 * Run a set of scrape jobs; `jobs` is non-const, so jobs may be updated.
 * NOTE(review): presumably `jobs` is an array of num_jobs entries whose
 * results_count, http_status and status fields are filled in; the meaning of
 * the int return value is not visible here -- confirm.
 */
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);

#endif