1
0

Added charset detection from HTTP headers. Fixes #1

This commit is contained in:
2013-04-10 11:36:25 +02:00
parent f0d9a52e02
commit 60ee671064
4 changed files with 127 additions and 8 deletions

60
tests/charset.php Normal file
View File

@ -0,0 +1,60 @@
<?php
// Manual test script: downloads one article, detects the charset announced in
// the HTTP Content-Type response header, parses the page with that charset and
// extracts the node selected by the configured XPath expression.
$config = array(
    'type' => 'xpath',
    'xpath' => 'div[@class="meldung_wrapper"]',
);
// http://www.heise.de/newsticker/heise-atom.xml
$article = array(
    'link' => 'http://www.heise.de/newsticker/meldung/Fruehjahrspatches-Microsoft-9-Adobe-3-1838175.html/from/atom10',
    'content' => 'This is the feed content',
    'plugin_data' => '',
);
// $owner_uid is not defined anywhere in this script; use a stand-in so the
// plugin_data line below does not raise an undefined-variable notice.
// NOTE(review): presumably supplied by the host application in the real
// plugin -- confirm.
$owner_uid = 0;

$html = file_get_contents($article['link']);
if ($html === false) {
    // Nothing to parse; the response headers may not be populated either.
    echo 'ERROR: could not fetch ' . $article['link'] . PHP_EOL;
    exit(1);
}

// BEGIN --- New code
// $http_response_header is populated by the HTTP stream wrapper in the local
// scope; guard against it being absent.
$headers = isset($http_response_header) ? $http_response_header : array();
$content_type = false;
foreach ($headers as $h) {
    // Header names are case-insensitive, and the whitespace after the colon
    // is optional, so compare the name only and trim the value.
    if (strncasecmp($h, 'content-type:', 13) === 0) {
        $content_type = trim(substr($h, 13));
        // don't break here to find LATEST (if redirected) entry
    }
}
$charset = false;
// Accept `charset=utf-8`, `Charset="utf-8"` and `charset=utf-8; foo=bar`:
// match case-insensitively, skip an optional opening quote, and stop at
// whitespace, quotes or a semicolon.
if ($content_type
    && preg_match('/charset\s*=\s*["\']?([^\s"\';]+)/i', $content_type, $matches)
) {
    $charset = $matches[1];
}
// END --- New code
echo 'CHARSET: ' . $charset . PHP_EOL;

$doc = new DOMDocument();
// Prepending an XML declaration makes libxml decode the document with the
// given encoding. Only do so when a charset was actually detected; an empty
// encoding attribute would be meaningless.
if ($charset) {
    $loaded = $doc->loadHTML('<?xml encoding="' . $charset . '">' . $html);
} else {
    $loaded = $doc->loadHTML($html);
}
echo 'ENCODING: ' . $doc->encoding . PHP_EOL;

// Test the result of loadHTML(): the DOMDocument object itself is always truthy.
if ($loaded) {
    $basenode = false;
    $xpath = new DOMXPath($doc);
    $entries = $xpath->query('(//' . $config['xpath'] . ')'); // find main DIV according to config
    var_dump($entries);
    if ($entries !== false && $entries->length > 0) {
        $basenode = $entries->item(0);
    }
    if ($basenode) {
        $article['content'] = $doc->saveXML($basenode);
        $article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data'];
    }
}
print_r($article);

11
tests/charsetcurl.php Normal file
View File

@ -0,0 +1,11 @@
<?php
// Manual test script: requests the article with cURL and dumps the
// Content-Type that cURL reports for the response.
$ch = curl_init('http://www.heise.de/newsticker/meldung/Fruehjahrspatches-Microsoft-9-Adobe-3-1838175.html/from/atom10');
if ($ch === false) {
    echo 'ERROR: could not initialise cURL' . PHP_EOL;
    exit(1);
}
// Capture the body instead of echoing it, so that the only output of this
// script is the content type dumped below.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
if (curl_exec($ch) === false) {
    // Without a completed transfer there is no content type worth reporting.
    echo 'ERROR: ' . curl_error($ch) . PHP_EOL;
    exit(1);
}
var_dump(curl_getinfo($ch, CURLINFO_CONTENT_TYPE));