Browse code

initial import

devnewton authored on 23/06/2017 08:27:09
Showing 2 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,30 @@
1
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Build descriptor for fta2tsv: a single jar compiled for Java 8 from UTF-8 sources. -->
    <modelVersion>4.0.0</modelVersion>
    <groupId>im.bci</groupId>
    <artifactId>fta2tsv</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- Parallel HTTP client used to fetch the archives. -->
        <dependency>
            <groupId>io.parallec</groupId>
            <artifactId>parallec-core</artifactId>
            <version>0.10.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-csv</artifactId>
            <version>1.4</version>
        </dependency>
        <!--
            The Java sources import org.apache.commons.io.FileUtils directly, so
            commons-io is declared here explicitly instead of relying on some
            other dependency to bring it in transitively.
        -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>
    </dependencies>
</project>
0 31
\ No newline at end of file
1 32
new file mode 100644
... ...
@@ -0,0 +1,109 @@
1
+package im.bci.fta2tsv;
2
+
3
+import io.parallec.core.ParallecHeader;
4
+import io.parallec.core.ParallecResponseHandler;
5
+import io.parallec.core.ParallelClient;
6
+import io.parallec.core.ResponseOnSingleTask;
7
+import java.io.File;
8
+import java.io.IOException;
9
+import java.nio.charset.Charset;
10
+import java.time.LocalDate;
11
+import java.time.format.DateTimeFormatter;
12
+import java.util.ArrayList;
13
+import java.util.Map;
14
+import java.util.logging.Level;
15
+import java.util.logging.Logger;
16
+import org.apache.commons.io.FileUtils;
17
+import org.apache.commons.io.IOUtils;
18
+import org.apache.commons.lang3.StringUtils;
19
+
20
+/**
21
+ *
22
+ * @author devnewton
23
+ */
24
+public class Fta2Tsv {
25
+
26
+    private static final String[] TRIBUNES = {"batavie", "dlfp", "euromussels", "finss", "eurofaab", "old-dlfp"};
27
+    private static final LocalDate START_DATE = LocalDate.of(2017, 6, 20);
28
+    private static final LocalDate END_DATE = LocalDate.now().plusDays(1);
29
+
30
+    public static void main(String[] args) {
31
+        ArrayList<String> requests = new ArrayList<>();
32
+        for (String tribune : TRIBUNES) {
33
+            for (LocalDate date = START_DATE; date.isBefore(END_DATE); date = date.plusDays(1)) {
34
+                requests.add(tribune + "/" + date.toString());
35
+            }
36
+        }
37
+        ParallelClient pc = new ParallelClient();
38
+        pc.prepareHttpGet("/t/$REQ")
39
+                .setHttpHeaders(new ParallecHeader().addPair("Accept", "text/tab-separated-values"))
40
+                .setReplaceVarMapToSingleTargetSingleVar("REQ", requests, "bombefourchette.com")
41
+                .execute(new ParallecResponseHandler() {
42
+                    @Override
43
+                    public void onCompleted(ResponseOnSingleTask res, Map<String, Object> responseContext) {
44
+                        String body = res.getResponseContent();
45
+                        if(StringUtils.isNotBlank(body)) {
46
+                            try {
47
+                                String filename = StringUtils.removeStart(res.getRequest().getResourcePath(), "/t/");
48
+                                filename = StringUtils.replace(filename, "/", "_") + ".tsv";
49
+                                FileUtils.write(new File(filename), body, "UTF-8");
50
+                            } catch (IOException ex) {
51
+                                Logger.getLogger(Fta2Tsv.class.getName()).log(Level.SEVERE, null, ex);
52
+                            }
53
+                        }
54
+                        System.out.println(body);
55
+                    }
56
+                });
57
+
58
+    }
59
+
60
+    private static final DateTimeFormatter POST_DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMddhhmmss");
61
+    private static final DateTimeFormatter POST_TIME_FORMATTER = DateTimeFormatter.ofPattern("HH:mm:ss");/*
62
+	private static final Whitelist MESSAGE_WHITELIST = Whitelist.none().addTags("b", "i", "s", "u", "tt", "code", "spoiler");
63
+
64
+    
65
+    private static void retrieve(String tribune, LocalDate date) {
66
+        String url = "http://bombefourchette.com/t/" + tribune + "/" + date.toString();
67
+        Element debugPost = null;
68
+        try (FileWriter fw = new FileWriter(tribune + "-" + date.toString() + ".tsv")) {
69
+            Document doc = Jsoup.connect(url).get();
70
+            final CSVPrinter printer = CSVFormat.TDF.print(fw);
71
+            for (Element post : doc.select("#posts > li")) {
72
+                debugPost = post;
73
+                Element firstA = post.select("a:first-child").get(0);
74
+                printer.print(firstA.attr("id"));
75
+                LocalDateTime dateTime = LocalDateTime.of(date, LocalTime.parse(firstA.select(".horloge").text(), POST_TIME_FORMATTER));
76
+                printer.print(dateTime.format(POST_DATE_TIME_FORMATTER));
77
+                String info = "";
78
+                for (Element infoElement : firstA.select(".info")) {
79
+                    info = StringUtils.removeEnd(infoElement.text(), ">");
80
+                    infoElement.remove();
81
+                }
82
+                String login = "";
83
+                for (Element loginElement : firstA.select(".login")) {
84
+                    login = StringUtils.removeEnd(loginElement.text(), ">");
85
+                    loginElement.remove();
86
+                }
87
+                printer.print(info);
88
+                printer.print(login);
89
+                firstA.replaceWith(TextNode.createFromEncoded(firstA.html(), null));
90
+
91
+                for (Element element : doc.body().children().select(":not(a,b,i,s,u,tt,code,spoiler)")) {
92
+                    element.replaceWith(TextNode.createFromEncoded(element.toString(), null));
93
+                }
94
+                for (Element a : post.select("a")) {
95
+                    a.replaceWith(TextNode.createFromEncoded(a.attr("href"), null));
96
+                }
97
+                		Cleaner cleaner = new Cleaner(MESSAGE_WHITELIST);
98
+		String message = cleaner.clean(Jsoup.parse(post.html())).html();
99
+
100
+                printer.print(message);
101
+                printer.println();
102
+            }
103
+        } catch (Exception e) {
104
+            System.err.println("Cannot retrieve " + url);
105
+            System.err.println(debugPost);
106
+            System.err.println(e);
107
+        }
108
+    }*/
109
+}