Added charset detection from HTTP headers. Fixes #1

2013-04-10 11:36:25 +02:00 · 2013-04-10 11:36:25 +02:00 · 60ee671064
commit 60ee671064
parent f0d9a52e02
4 changed files with 127 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -1,10 +1,11 @@
 ttrss_plugin-af_feedmod
 =======================

-Installation
--------------------
+This is a plugin for Tiny Tiny RSS (tt-rss). It allows you to replace an article's contents with the contents of an element on the linked URL's page, i.e. to create a "full feed".

-This is a plugin for Tiny Tiny RSS (tt-rss). It allows you to replace an article's contents by the contents of an element on the linked URL's page.
+
+Installation
+------------

 Checkout the directory into your plugins folder like this (from tt-RSS root directory):

@ -17,7 +18,7 @@ Then enable the plugin in preferences.


 Configuration
--------------------
+-------------

 The configuration is done in JSON format. In the preferences, you'll find a new tab called *FeedMod*. Use the large field to enter/modify the configuration data and click the **Save** button to store it.

@ -28,7 +29,8 @@ A configuration looks like this:

 "heise.de": {
    "type": "xpath",
-    "xpath": "div[@class='meldung_wrapper']"
+    "xpath": "div[@class='meldung_wrapper']",
+    "force_charset": "utf-8"
 },
 "berlin.de/polizei": {
    "type": "xpath",
@ -41,6 +43,14 @@ A configuration looks like this:
 "golem0Bde0C": {
    "type": "xpath",
    "xpath": "article"
+},
+"oatmeal": {
+    "type": "xpath",
+    "xpath": "div[@id='comic']"
+},
+"blog.beetlebum.de": {
+    "type": "xpath",
+    "xpath": "div[@class='entry-content']"
 }

 }
@ -48,9 +58,11 @@ A configuration looks like this:

 The *array key* is part of the URL of the article links(!). You'll notice the `golem0Bde0C` in the last entry: That's because all their articles link to something like `http://rss.feedsportal.com/c/33374/f/578068/p/1/s/3f6db44e/l/0L0Sgolem0Bde0Cnews0Cthis0Eis0Ean0Eexample0A10Erss0Bhtml/story01.htm` and to have the plugin match that URL and not interfere with other feeds using *feedsportal.com*, I used the part `golem0Bde0C`.

-The **type** has to be `xpath` for now. Maybe there will be more types in the future.
+**type** has to be `xpath` for now. Maybe there will be more types in the future.

-The **xpath** value is the actual Xpath-element to fetch from the linked page.
+The **xpath** value is the XPath expression that selects the element to fetch from the linked page. Omit the leading `//`; it is prepended automatically.
+
+**force_charset** lets you override automatic charset detection. If it is omitted, the charset is taken from the HTTP `Content-Type` header; if that header does not name a charset, `loadHTML()` decides on its own.


 If you get an error about "Invalid JSON!", you can use [JSONLint](http://jsonlint.com/) to locate the erroneous part.
--- a/init.php
+++ b/init.php
@ -46,6 +46,8 @@ class Af_Feedmod extends Plugin implements IHandler

    function hook_article_filter($article)
    {
+        global $fetch_last_content_type;
+
        $json_conf = $this->host->get($this, 'json_conf');
        $owner_uid = $article['owner_uid'];
        $data = json_decode($json_conf, true);
@ -61,7 +63,41 @@ class Af_Feedmod extends Plugin implements IHandler
            switch ($config['type']) {
                case 'xpath':
                    $doc = new DOMDocument();
-                    @$doc->loadHTML(fetch_file_contents($article['link']));
+
+                    if (version_compare(VERSION, '1.7.9', '>=')) {
+                        $html = fetch_file_contents($article['link']);
+                        $content_type = $fetch_last_content_type;
+                    } else {
+                        // fallback to file_get_contents()
+                        $html = file_get_contents($article['link']);
+
+                        // try to fetch charset from HTTP headers
+                        $headers = $http_response_header;
+                        $content_type = false;
+                        foreach ($headers as $h) {
+                            if (substr(strtolower($h), 0, 13) == 'content-type:') {
+                                $content_type = substr($h, 14);
+                                // don't break here to find LATEST (if redirected) entry
+                            }
+                        }
+                    }
+
+                    if (!isset($config['force_charset'])) {
+                        $charset = false;
+                        if ($content_type) {
+                            preg_match('/charset=(\S+)/', $content_type, $matches);
+                            if (isset($matches[1]) && !empty($matches[1])) $charset = $matches[1];
+                        }
+
+                        if ($charset) {
+                            $html = '<?xml encoding="' . $charset . '">' . $html;
+                        }
+                    } else {
+                        // use forced charset
+                        $html = '<?xml encoding="' . $config['force_charset'] . '">' . $html;
+                    }
+
+                    @$doc->loadHTML($html);

                    if ($doc) {
                        $basenode = false;
--- a/tests/charset.php
+++ b/tests/charset.php
@ -0,0 +1,60 @@
+<?php
+
+/**
+ * Manual test script: fetches an article page with file_get_contents(),
+ * detects the charset from the HTTP Content-Type response header, loads the
+ * page into a DOMDocument and extracts the element selected by the configured
+ * XPath expression.
+ *
+ * Performs a live HTTP request; prints the detected charset, the encoding
+ * DOMDocument settled on, the XPath result and the resulting article array.
+ */
+
+$config = array(
+    'type' => 'xpath',
+    'xpath' => 'div[@class="meldung_wrapper"]',
+);
+
+// http://www.heise.de/newsticker/heise-atom.xml
+
+$article = array(
+    'link' => 'http://www.heise.de/newsticker/meldung/Fruehjahrspatches-Microsoft-9-Adobe-3-1838175.html/from/atom10',
+    'content' => 'This is the feed content',
+    'plugin_data' => '',
+    'owner_uid' => 1,   // placeholder value so the plugin_data line below has an owner
+);
+
+// The plugin_data string below interpolates $owner_uid; define it here so the
+// standalone script does not hit an undefined variable.
+$owner_uid = $article['owner_uid'];
+
+$html = file_get_contents($article['link']);
+if ($html === false || $html === '') {
+    // Nothing to parse; $http_response_header may not even be set in this case.
+    echo 'ERROR: could not fetch ' . $article['link'] . PHP_EOL;
+    exit(1);
+}
+
+// BEGIN --- New code
+// $http_response_header is populated by file_get_contents() for HTTP URLs.
+$headers = isset($http_response_header) ? $http_response_header : array();
+$content_type = false;
+foreach ($headers as $h) {
+    if (strncasecmp($h, 'content-type:', 13) === 0) {
+        // trim() instead of a fixed offset: whitespace after the colon is optional
+        $content_type = trim(substr($h, 13));
+        // don't break here to find LATEST (if redirected) entry
+    }
+}
+
+$charset = false;
+// Case-insensitive match; optional quotes and a trailing ";" are not part of the charset name.
+if ($content_type && preg_match('/charset=["\']?([^\s;"\']+)/i', $content_type, $matches)) {
+    $charset = $matches[1];
+}
+
+// END --- New code
+
+echo 'CHARSET: ' . ($charset ? $charset : '(none)') . PHP_EOL;
+
+
+// Only prepend the encoding hint when a charset was actually detected;
+// otherwise leave the decision to loadHTML().
+if ($charset) {
+    $html = '<?xml encoding="' . $charset . '">' . $html;
+}
+
+$doc = new DOMDocument();
+
+// Real-world HTML is rarely valid: collect libxml warnings instead of printing them.
+libxml_use_internal_errors(true);
+$loaded = $doc->loadHTML($html);
+libxml_clear_errors();
+
+echo 'ENCODING: ' . $doc->encoding . PHP_EOL;
+
+if ($loaded) {
+    $basenode = false;
+    $xpath = new DOMXPath($doc);
+    $entries = $xpath->query('(//'.$config['xpath'].')');   // find main DIV according to config
+
+    var_dump($entries);
+
+    // query() returns false for a malformed expression
+    if ($entries && $entries->length > 0) $basenode = $entries->item(0);
+
+    if ($basenode) {
+        $article['content'] = $doc->saveXML($basenode);
+        $article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data'];
+    }
+}
+
+print_r($article);
--- a/tests/charsetcurl.php
+++ b/tests/charsetcurl.php
@ -0,0 +1,11 @@
+<?php
+
+/**
+ * Manual test script: requests the article URL with cURL and dumps the
+ * Content-Type that cURL reports for the response.
+ *
+ * Performs a live HTTP request.
+ */
+
+$ch = curl_init('http://www.heise.de/newsticker/meldung/Fruehjahrspatches-Microsoft-9-Adobe-3-1838175.html/from/atom10');
+if ($ch === false) {
+    echo 'ERROR: curl_init() failed' . PHP_EOL;
+    exit(1);
+}
+
+// Return the body instead of echoing it, so the output is only the var_dump() below.
+curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+
+if (curl_exec($ch) === false) {
+    echo 'ERROR: ' . curl_error($ch) . PHP_EOL;
+    curl_close($ch);
+    exit(1);
+}
+
+var_dump(curl_getinfo($ch,CURLINFO_CONTENT_TYPE));
+
+curl_close($ch);
+