1
0

Added charset detection from HTTP headers. Fixes #1

This commit is contained in:
2013-04-10 11:36:25 +02:00
parent f0d9a52e02
commit 60ee671064
4 changed files with 127 additions and 8 deletions

View File

@@ -46,6 +46,8 @@ class Af_Feedmod extends Plugin implements IHandler
function hook_article_filter($article)
{
global $fetch_last_content_type;
$json_conf = $this->host->get($this, 'json_conf');
$owner_uid = $article['owner_uid'];
$data = json_decode($json_conf, true);
@@ -61,7 +63,41 @@ class Af_Feedmod extends Plugin implements IHandler
switch ($config['type']) {
case 'xpath':
$doc = new DOMDocument();
@$doc->loadHTML(fetch_file_contents($article['link']));
if (version_compare(VERSION, '1.7.9', '>=')) {
$html = fetch_file_contents($article['link']);
$content_type = $fetch_last_content_type;
} else {
// fallback to file_get_contents()
$html = file_get_contents($article['link']);
// try to fetch charset from HTTP headers
$headers = $http_response_header;
$content_type = false;
foreach ($headers as $h) {
if (substr(strtolower($h), 0, 13) == 'content-type:') {
$content_type = substr($h, 14);
// don't break here to find LATEST (if redirected) entry
}
}
}
if (!isset($config['force_charset'])) {
$charset = false;
if ($content_type) {
preg_match('/charset=(\S+)/', $content_type, $matches);
if (isset($matches[1]) && !empty($matches[1])) $charset = $matches[1];
}
if ($charset) {
$html = '<?xml encoding="' . $charset . '">' . $html;
}
} else {
// use forced charset
$html = '<?xml encoding="' . $config['force_charset'] . '">' . $html;
}
@$doc->loadHTML($html);
if ($doc) {
$basenode = false;