Added charset detection from HTTP headers. Fixes #1
This commit is contained in:
parent
f0d9a52e02
commit
60ee671064
26
README.md
26
README.md
@ -1,10 +1,11 @@
|
||||
ttrss_plugin-af_feedmod
|
||||
=======================
|
||||
|
||||
Installation
|
||||
--------------------
|
||||
This is a plugin for Tiny Tiny RSS (tt-rss). It allows you to replace an article's contents by the contents of an element on the linked URL's page, i.e. create a "full feed".
|
||||
|
||||
This is a plugin for Tiny Tiny RSS (tt-rss). It allows you to replace an article's contents by the contents of an element on the linked URL's page.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Check out the directory into your plugins folder like this (from the tt-rss root directory):
|
||||
|
||||
@ -17,7 +18,7 @@ Then enable the plugin in preferences.
|
||||
|
||||
|
||||
Configuration
|
||||
--------------------
|
||||
-------------
|
||||
|
||||
The configuration is done in JSON format. In the preferences, you'll find a new tab called *FeedMod*. Use the large field to enter/modify the configuration data and click the **Save** button to store it.
|
||||
|
||||
@ -28,7 +29,8 @@ A configuration looks like this:
|
||||
|
||||
"heise.de": {
|
||||
"type": "xpath",
|
||||
"xpath": "div[@class='meldung_wrapper']"
|
||||
"xpath": "div[@class='meldung_wrapper']",
|
||||
"force_charset": "utf-8"
|
||||
},
|
||||
"berlin.de/polizei": {
|
||||
"type": "xpath",
|
||||
@ -41,6 +43,14 @@ A configuration looks like this:
|
||||
"golem0Bde0C": {
|
||||
"type": "xpath",
|
||||
"xpath": "article"
|
||||
},
|
||||
"oatmeal": {
|
||||
"type": "xpath",
|
||||
"xpath": "div[@id='comic']"
|
||||
},
|
||||
"blog.beetlebum.de": {
|
||||
"type": "xpath",
|
||||
"xpath": "div[@class='entry-content']"
|
||||
}
|
||||
|
||||
}
|
||||
@ -48,9 +58,11 @@ A configuration looks like this:
|
||||
|
||||
The *array key* is part of the URL of the article links(!). You'll notice the `golem0Bde0C` in the last entry: That's because all their articles link to something like `http://rss.feedsportal.com/c/33374/f/578068/p/1/s/3f6db44e/l/0L0Sgolem0Bde0Cnews0Cthis0Eis0Ean0Eexample0A10Erss0Bhtml/story01.htm` and to have the plugin match that URL and not interfere with other feeds using *feedsportal.com*, I used the part `golem0Bde0C`.
|
||||
|
||||
The **type** has to be `xpath` for now. Maybe there will be more types in the future.
|
||||
**type** has to be `xpath` for now. Maybe there will be more types in the future.
|
||||
|
||||
The **xpath** value is the actual Xpath-element to fetch from the linked page.
|
||||
The **xpath** value is the XPath expression that selects the element to fetch from the linked page. Omit the leading `//` - it is prepended automatically.
|
||||
|
||||
**force_charset** lets you override automatic charset detection. If it is omitted, the charset is parsed from the HTTP `Content-Type` header; if that header does not name a charset, loadHTML() decides on its own.
|
||||
|
||||
|
||||
If you get an error about "Invalid JSON!", you can use [JSONLint](http://jsonlint.com/) to locate the erroneous part.
|
||||
|
38
init.php
38
init.php
@ -46,6 +46,8 @@ class Af_Feedmod extends Plugin implements IHandler
|
||||
|
||||
function hook_article_filter($article)
|
||||
{
|
||||
global $fetch_last_content_type;
|
||||
|
||||
$json_conf = $this->host->get($this, 'json_conf');
|
||||
$owner_uid = $article['owner_uid'];
|
||||
$data = json_decode($json_conf, true);
|
||||
@ -61,7 +63,41 @@ class Af_Feedmod extends Plugin implements IHandler
|
||||
switch ($config['type']) {
|
||||
case 'xpath':
|
||||
$doc = new DOMDocument();
|
||||
@$doc->loadHTML(fetch_file_contents($article['link']));
|
||||
|
||||
if (version_compare(VERSION, '1.7.9', '>=')) {
|
||||
$html = fetch_file_contents($article['link']);
|
||||
$content_type = $fetch_last_content_type;
|
||||
} else {
|
||||
// fallback to file_get_contents()
|
||||
$html = file_get_contents($article['link']);
|
||||
|
||||
// try to fetch charset from HTTP headers
|
||||
$headers = $http_response_header;
|
||||
$content_type = false;
|
||||
foreach ($headers as $h) {
|
||||
if (substr(strtolower($h), 0, 13) == 'content-type:') {
|
||||
$content_type = substr($h, 14);
|
||||
// don't break here to find LATEST (if redirected) entry
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!isset($config['force_charset'])) {
|
||||
$charset = false;
|
||||
if ($content_type) {
|
||||
preg_match('/charset=(\S+)/', $content_type, $matches);
|
||||
if (isset($matches[1]) && !empty($matches[1])) $charset = $matches[1];
|
||||
}
|
||||
|
||||
if ($charset) {
|
||||
$html = '<?xml encoding="' . $charset . '">' . $html;
|
||||
}
|
||||
} else {
|
||||
// use forced charset
|
||||
$html = '<?xml encoding="' . $config['force_charset'] . '">' . $html;
|
||||
}
|
||||
|
||||
@$doc->loadHTML($html);
|
||||
|
||||
if ($doc) {
|
||||
$basenode = false;
|
||||
|
60
tests/charset.php
Normal file
60
tests/charset.php
Normal file
@ -0,0 +1,60 @@
|
||||
<?php

/**
 * Manual check for charset detection from HTTP response headers.
 *
 * Fetches a sample article, derives the charset from the last Content-Type
 * header of the response, hints it to DOMDocument::loadHTML() and prints the
 * node selected by the configured XPath expression.
 */

$config = array(
    'type'  => 'xpath',
    'xpath' => 'div[@class="meldung_wrapper"]',
);

// Sample article taken from http://www.heise.de/newsticker/heise-atom.xml
$article = array(
    'link'        => 'http://www.heise.de/newsticker/meldung/Fruehjahrspatches-Microsoft-9-Adobe-3-1838175.html/from/atom10',
    'content'     => 'This is the feed content',
    'plugin_data' => '',
);

// Placeholder owner id: this standalone script has no article owner, but the
// plugin_data string built below needs a value.
$owner_uid = 1;

$html = file_get_contents($article['link']);
if ($html === false || $html === '') {
    echo 'ERROR: could not fetch ' . $article['link'] . PHP_EOL;
    exit(1);
}

// BEGIN --- New code

// $http_response_header is only populated by a successful HTTP wrapper call,
// so fall back to an empty list instead of iterating over an undefined value.
$headers = isset($http_response_header) ? $http_response_header : array();

$content_type = false;
foreach ($headers as $h) {
    if (strncasecmp($h, 'content-type:', 13) === 0) {
        // trim() instead of a fixed offset: the space after the colon is optional
        $content_type = trim(substr($h, 13));
        // don't break here to find LATEST (if redirected) entry
    }
}

$charset = false;
// Case-insensitive match; optional quotes and a trailing ';' are left out of
// the capture so they cannot leak into the encoding hint below.
if ($content_type && preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $content_type, $matches)) {
    $charset = $matches[1];
}

// END --- New code

echo 'CHARSET: ' . $charset . PHP_EOL;

// Only hint an encoding when one was actually detected; otherwise let
// loadHTML() decide on its own.
if ($charset) {
    $html = '<?xml encoding="' . $charset . '">' . $html;
}

$doc = new DOMDocument();
$loaded = $doc->loadHTML($html);

echo 'ENCODING: ' . $doc->encoding . PHP_EOL;

// A DOMDocument object is always truthy, so test the result of loadHTML().
if ($loaded) {
    $xpath = new DOMXPath($doc);
    $entries = $xpath->query('(//' . $config['xpath'] . ')'); // find main DIV according to config

    var_dump($entries);

    // query() returns false for an invalid expression
    $basenode = ($entries !== false && $entries->length > 0) ? $entries->item(0) : false;

    if ($basenode) {
        $article['content'] = $doc->saveXML($basenode);
        $article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data'];
    }
}

print_r($article);
|
11
tests/charsetcurl.php
Normal file
11
tests/charsetcurl.php
Normal file
@ -0,0 +1,11 @@
|
||||
<?php

/**
 * Manual check: print the Content-Type that cURL reports for a sample
 * article URL.
 */

$ch = curl_init('http://www.heise.de/newsticker/meldung/Fruehjahrspatches-Microsoft-9-Adobe-3-1838175.html/from/atom10');
if ($ch === false) {
    echo 'ERROR: could not initialise cURL' . PHP_EOL;
    exit(1);
}

// Return the body instead of echoing it, so the only output of this script is
// the Content-Type dump below.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

if (curl_exec($ch) === false) {
    echo 'ERROR: ' . curl_error($ch) . PHP_EOL;
    exit(1);
}

var_dump(curl_getinfo($ch, CURLINFO_CONTENT_TYPE));
|
||||
|
Reference in New Issue
Block a user