summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: kdx <kikoodx@paranoici.org> 2023-01-13 04:03:40 +0100
committer: kdx <kikoodx@paranoici.org> 2023-01-13 04:03:40 +0100
commit: 1684f585b61073f5fb13126868c4a6b2298c18d0 (patch)
tree: 1674877bcaea29f444aee3697ee43f6e23e8cd45
parent: f9ca781f1222c95272b7f380460cf093821b5bee (diff)
download: cite-scrapper-1684f585b61073f5fb13126868c4a6b2298c18d0.tar.gz
scrap movie posters
-rw-r--r-- .gitignore 1
-rwxr-xr-x build.sh 4
-rw-r--r-- cite.c 34
-rwxr-xr-x scrap.sh 8
4 files changed, 41 insertions, 6 deletions
diff --git a/.gitignore b/.gitignore
index b15486f..cba224e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
seances.xml
+scrap
cite
cite.html
citenofr.html
diff --git a/build.sh b/build.sh
index 983c5fd..772a236 100755
--- a/build.sh
+++ b/build.sh
@@ -1,2 +1,4 @@
#!/bin/sh
-gcc -g -Wall -Wextra -Wno-pointer-sign $(xml2-config --cflags --libs) -o cite cite.c
+# Libraries go after cite.c: the linker resolves symbols left to right, so
+# -lcurl / libxml2 listed before the source can be dropped as "not needed".
+gcc -Wall -Wextra -Wno-pointer-sign \
+	$(xml2-config --cflags) \
+	-o scrap cite.c \
+	-lcurl $(xml2-config --libs)
diff --git a/cite.c b/cite.c
index 8feb067..12545c5 100644
--- a/cite.c
+++ b/cite.c
@@ -1,3 +1,4 @@
+#include <curl/curl.h>
#include <libxml/parser.h>
#include <string.h>
@@ -18,6 +19,7 @@ typedef struct {
} Movie;
Movie movies[64] = {0};
+CURL *curl = NULL;
static void print_style(void)
{
@@ -58,6 +60,27 @@ static void print_movie(Movie *movie, int hide_fr)
printf("\n");
}
+/*
+ * Download the poster whose URL is stored in movie->poster into the local
+ * "cite/" directory, using the last path component of the URL as file name.
+ *
+ * On success movie->poster is rewritten in place to that bare file name.
+ * On any failure (name too long, file cannot be created, transfer or write
+ * error) movie->poster is left untouched and no partial file is kept.
+ * Uses the file-scope `curl` easy handle.
+ */
+static void download_poster(Movie *movie)
+{
+	char out_path[512];
+
+	/* File name = everything after the last '/', or the whole string. */
+	char *name = movie->poster;
+	char *const slash = strrchr(name, '/');
+	if (slash != NULL)
+		name = slash + 1;
+	if (*name == '\0')
+		return; /* URL ends with '/': nothing to name the file after */
+
+	/* Bounded formatting: refuse names that would not fit in out_path. */
+	const int len = snprintf(out_path, sizeof out_path, "cite/%s", name);
+	if (len < 0 || (size_t)len >= sizeof out_path)
+		return;
+
+	FILE *const fp = fopen(out_path, "wb");
+	if (fp == NULL)
+		return;
+
+	curl_easy_setopt(curl, CURLOPT_URL, movie->poster);
+	/* NULL selects libcurl's default callback, which fwrite()s to WRITEDATA. */
+	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
+	curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
+
+	fprintf(stderr, "getting %s", out_path);
+	const CURLcode res = curl_easy_perform(curl);
+	/* Always close: releases the handle and flushes buffered data. */
+	const int close_failed = (fclose(fp) != 0);
+
+	if (res != CURLE_OK || close_failed) {
+		fprintf(stderr, "\rfailed %s: %s\n", out_path,
+			res != CURLE_OK ? curl_easy_strerror(res) : "write error");
+		remove(out_path); /* do not leave an empty/partial poster behind */
+		return;
+	}
+
+	/* Trailing spaces wipe the tail of the longer "getting ..." line. */
+	fprintf(stderr, "\rgot %s    \n", out_path);
+	/* name points inside movie->poster, so the regions overlap: memmove. */
+	memmove(movie->poster, name, strlen(name) + 1);
+}
+
static void xfree(const void *ptr)
{
if (ptr != NULL)
@@ -170,6 +193,12 @@ int main(int argc, char **argv)
xmlFreeDoc(document);
return 1;
}
+ curl = curl_easy_init();
+ if (curl == NULL) {
+ xmlCleanupParser();
+ xmlFreeDoc(document);
+ return 1;
+ }
for (const xmlNode *week = root->children; week != NULL; week = week->next) {
for (const xmlNode *mov = get_movie_node(week); mov != NULL; mov = mov->next) {
const int id = get_id(mov);
@@ -185,8 +214,11 @@ int main(int argc, char **argv)
}
print_style();
printf("<h1><a href=\"http://www.citebd.org/spip.php?film2912\">cinéma de la cité</a></h1>\n");
- for (Movie *movie = movies; movie->id != 0; movie++)
+ for (Movie *movie = movies; movie->id != 0; movie++) {
+ download_poster(movie);
print_movie(movie, atoi(argv[2]));
+ }
+ curl_easy_cleanup(curl);
xmlFreeDoc(document);
xmlCleanupParser();
return 0;
diff --git a/scrap.sh b/scrap.sh
index bb4862a..9228443 100755
--- a/scrap.sh
+++ b/scrap.sh
@@ -1,7 +1,7 @@
#!/bin/sh
./build.sh || exit 1
curl -o seances.xml 'http://www.citebd.org/IMG/xml/allocineseances-4.xml' || exit 1
-./cite seances.xml 0 >cite.html || exit 1
-scp cite.html root@kdx.re:/var/www/html
-./cite seances.xml 1 >citenofr.html || exit 1
-scp citenofr.html root@kdx.re:/var/www/html
+# Output directory for the generated pages and downloaded posters.
+mkdir -p cite || exit 1
+# Second argument is passed to atoi() and forwarded as print_movie's hide_fr.
+./scrap seances.xml 0 >cite/index.html || exit 1
+./scrap seances.xml 1 >cite/nofr.html || exit 1
+# Mirror the whole directory; --delete removes remote files no longer present.
+rsync -rvu --delete cite root@kdx.re:/var/www/html