1
0

Refactoring of init.php

- fetching is now done by a global function in fetch.php. This
  allows the function to be reused in test/xpath.php for testing.
- in addition to the user configuration explained in README.md, JSON
  files from `mods` are now also loaded and used for fetching
  articles
- JSON files are loaded only once instead of once for each
  article being processed
- test/xpath.php is now a script that can be used to test a
  configuration from the `mods` directory

Added two mods for 4players and visions.de.

TODO: rewrite README.md
This commit is contained in:
d
2014-05-15 12:42:56 +02:00
parent f668480416
commit 5448bd752d
5 changed files with 274 additions and 207 deletions

86
fetch.php Normal file
View File

@ -0,0 +1,86 @@
<?php
/**
 * Main global function for fetching and processing articles.
 *
 * Downloads the page referenced by $article['link'], selects the first node
 * matching the configured XPath expression, strips the configured cleanup
 * nodes/attributes and returns the serialized markup of that node.
 *
 * @param array $article Article data; only $article['link'] is read here.
 * @param array $config  Mod configuration. Keys used:
 *                       - 'type'          must be 'xpath' (the only supported type)
 *                       - 'xpath'         expression for the main node; '//' is prepended
 *                       - 'force_charset' optional, overrides the charset from the HTTP headers
 *                       - 'force_unicode' optional, if truthy the page is converted to UTF-8
 *                       - 'cleanup'       optional, one expression or a list of expressions
 *                                         whose matches are removed; '//' is prepended
 *
 * @return string|array|false The serialized node on success; false when the
 *                            config type is not 'xpath'; the unchanged $article
 *                            array when the page could not be loaded or nothing
 *                            matched.
 *                            NOTE(review): callers that store the result as
 *                            article content must tell the array (failure)
 *                            apart from the string (success).
 */
function fetch_article($article, $config) {
    // Without this declaration the variable read below would be an (always
    // undefined) local. Presumably it is populated by fetch_file_contents().
    global $fetch_last_content_type;

    // the only config type supported now is 'xpath'
    if (!isset($config['type']) || $config['type'] != 'xpath')
        return false;

    $link = trim($article['link']);
    $content_type = false;

    if (defined('VERSION') && version_compare(VERSION, '1.7.9', '>=')) {
        $html = fetch_file_contents($link);
        $content_type = $fetch_last_content_type;
    }
    else {
        // fallback to file_get_contents()
        $html = file_get_contents($link);

        // $http_response_header only exists after an HTTP(S) request was made;
        // it is unset for failed connections and non-HTTP links.
        $headers = isset($http_response_header) ? $http_response_header : array();

        // try to fetch charset from HTTP headers
        foreach ($headers as $h) {
            if (substr(strtolower($h), 0, 13) == 'content-type:') {
                // no early exit: after redirects the LATEST entry is the relevant one
                $content_type = trim(substr($h, 13));
            }
        }
    }

    // Nothing to parse (download failed or empty body): report "no match".
    if (!is_string($html) || $html === '')
        return $article;

    $charset = false;
    if (isset($config['force_charset'])) {
        // use forced charset
        $charset = $config['force_charset'];
    }
    elseif ($content_type && preg_match('/charset=(\S+)/', $content_type, $matches)) {
        // strip quotes / a trailing separator, e.g. charset="utf-8";
        $candidate = trim($matches[1], "\"';");
        if ($candidate !== '')
            $charset = $candidate;
    }

    if ($charset && !empty($config['force_unicode'])) {
        $converted = iconv($charset, 'utf-8', $html);
        // keep the unconverted page (and its charset) if the conversion failed
        if ($converted !== false) {
            $html = $converted;
            $charset = 'utf-8';
        }
    }

    if ($charset) {
        // tell the HTML parser which encoding the document uses
        $html = '<?xml encoding="' . $charset . '">' . $html;
    }

    // Real-world HTML is rarely valid; collect parser errors silently instead
    // of emitting a warning for each of them, then restore the previous mode.
    $doc = new DOMDocument();
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$loaded || !isset($config['xpath']))
        return $article;

    // find main node according to config
    $xpath = new DOMXPath($doc);
    $entries = $xpath->query('(//' . $config['xpath'] . ')');
    if ($entries === false || $entries->length == 0)
        return $article;

    $basenode = $entries->item(0);

    // remove nodes from cleanup configuration
    if (isset($config['cleanup'])) {
        $cleanups = is_array($config['cleanup']) ? $config['cleanup'] : array($config['cleanup']);

        foreach ($cleanups as $cleanup) {
            // NOTE: an expression starting with '//' is evaluated against the
            // whole document, the context node does not restrict it.
            $nodelist = $xpath->query('//' . $cleanup, $basenode);
            if ($nodelist === false)
                continue; // invalid expression, skip it

            foreach ($nodelist as $node) {
                if ($node instanceof DOMAttr) {
                    if ($node->ownerElement)
                        $node->ownerElement->removeAttributeNode($node);
                }
                elseif ($node->parentNode) {
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    return $doc->saveXML($basenode);
}

146
init.php
View File

@ -1,9 +1,14 @@
<?php
require_once('fetch.php');
class Af_Feedmod extends Plugin implements IHandler
{
private $host;
private $mods;
private $mods_loaded = false;
function about()
{
return array(
@ -51,110 +56,60 @@ class Af_Feedmod extends Plugin implements IHandler
{
global $fetch_last_content_type;
$json_conf = $this->host->get($this, 'json_conf');
$owner_uid = $article['owner_uid'];
$data = json_decode($json_conf, true);
if (!is_array($data)) {
// no valid JSON or no configuration at all
//
// Load mods if they are not already loaded
//
if (!$this->mods_loaded) {
//
// Reading mod files
//
$this->mods = array();
// bad (!) hardcoded path
$mod_files = glob('plugins/af_feedmod/mods/*.json');
foreach ($mod_files as $file) {
$json = file_get_contents($file);
$mod = json_decode($json, true);
if (json_last_error() != JSON_ERROR_NONE)
continue;
if (!isset($mod['match']) || !isset($mod['config']))
continue;
$this->mods[$mod['match']] = $mod['config'];
}
//
// User mods
//
$json_conf = $this->host->get($this, 'json_conf');
$user_mods = json_decode($json_conf, true);
if (is_array($user_mods))
$this->mods = array_merge($this->mods, $user_mods);
$this->mods_loaded = true;
}
// article is already fetched
if (strpos($article['plugin_data'], "feedmod,$owner_uid:") !== false && isset($article['stored']['content']))
{
$article['content'] = $article['stored']['content'];
return $article;
}
foreach ($data as $urlpart=>$config) {
if (strpos($article['link'], $urlpart) === false) continue; // skip this config if URL not matching
if (strpos($article['plugin_data'], "feedmod,$owner_uid:") !== false) {
// do not process an article more than once
if (isset($article['stored']['content'])) $article['content'] = $article['stored']['content'];
foreach ($this->mods as $urlpart=>$config) {
if (strpos($article['link'], $urlpart) === false) continue;
$content = fetch_article($article, $config);
if (!$content)
break;
}
switch ($config['type']) {
case 'xpath':
$doc = new DOMDocument();
$link = trim($article['link']);
if (version_compare(VERSION, '1.7.9', '>=')) {
$html = fetch_file_contents($link);
$content_type = $fetch_last_content_type;
} else {
// fallback to file_get_contents()
$html = file_get_contents($link);
// try to fetch charset from HTTP headers
$headers = $http_response_header;
$content_type = false;
foreach ($headers as $h) {
if (substr(strtolower($h), 0, 13) == 'content-type:') {
$content_type = substr($h, 14);
// don't break here to find LATEST (if redirected) entry
}
}
}
$charset = false;
if (!isset($config['force_charset'])) {
if ($content_type) {
preg_match('/charset=(\S+)/', $content_type, $matches);
if (isset($matches[1]) && !empty($matches[1])) $charset = $matches[1];
}
} else {
// use forced charset
$charset = $config['force_charset'];
}
if ($charset && isset($config['force_unicode']) && $config['force_unicode']) {
$html = iconv($charset, 'utf-8', $html);
$charset = 'utf-8';
}
if ($charset) {
$html = '<?xml encoding="' . $charset . '">' . $html;
}
@$doc->loadHTML($html);
if ($doc) {
$basenode = false;
$xpath = new DOMXPath($doc);
$entries = $xpath->query('(//'.$config['xpath'].')'); // find main DIV according to config
if ($entries->length > 0) $basenode = $entries->item(0);
if ($basenode) {
// remove nodes from cleanup configuration
if (isset($config['cleanup'])) {
if (!is_array($config['cleanup'])) {
$config['cleanup'] = array($config['cleanup']);
}
foreach ($config['cleanup'] as $cleanup) {
$nodelist = $xpath->query('//'.$cleanup, $basenode);
foreach ($nodelist as $node) {
if ($node instanceof DOMAttr) {
$node->ownerElement->removeAttributeNode($node);
}
else {
$node->parentNode->removeChild($node);
}
}
}
}
$article['content'] = $doc->saveXML($basenode);
$article['content'] = $content;
$article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data'];
}
}
break;
default:
// unknown type or invalid config
continue;
}
break; // if we got here, we found the correct entry in $data, do not process more
}
return $article;
}
@ -212,5 +167,4 @@ class Af_Feedmod extends Plugin implements IHandler
$this->host->set($this, 'json_conf', $json_conf);
echo __("Configuration saved.");
}
}

12
mods/4player.de.json Normal file
View File

@ -0,0 +1,12 @@
{
"name": "4players.de",
"author": "boxdot",
"feed": "http://feeds.4players.de/Allgemein/news/-/rss.xml",
"match": "4players.de",
"config": {
"type": "xpath",
"xpath": "article",
"force_charset": "utf-8",
"cleanup": ["header", "footer", "div[contains(@class, 'social-facebook')]"]
}
}

11
mods/visions.de.json Normal file
View File

@ -0,0 +1,11 @@
{
"name": "VISIONS.de",
"author": "boxdot",
"feed": "http://rss.feedsportal.com/c/32350/f/443184/index.rss",
"match": "visions0Bde0C",
"config": {
"type": "xpath",
"xpath": "div[contains(@class, 'marginbt')]",
"force_charset": "utf-8"
}
}

View File

@ -1,33 +1,37 @@
<?php
$config = array(
'type' => 'xpath',
'xpath' => 'div[@itemprop="articleBody"]',
);
require_once('../fetch.php');
$article = array(
'link' => 'http://www.der-postillon.com/2013/04/nordkoreas-armee-nach-wochenlangem.html',
'content' => 'This is the feed content',
'plugin_data' => '',
);
$doc = new DOMDocument();
$html = file_get_contents($article['link']);
$doc->loadHTML($html);
if ($doc) {
$basenode = false;
$xpath = new DOMXPath($doc);
$entries = $xpath->query('(//'.$config['xpath'].')'); // find main DIV according to config
var_dump($entries);
if ($entries->length > 0) $basenode = $entries->item(0);
if ($basenode) {
$article['content'] = $doc->saveXML($basenode);
$article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data'];
}
if (count($argv) <= 2) {
echo 'USAGE: php fetch.php [mod_file] [article_url]' . PHP_EOL;
exit(1);
}
print_r($article);
$mod = $argv[1];
$article_url = $argv[2];
//
// Getting json config
//
$json = file_get_contents($mod);
$data = json_decode($json, true);
echo "<pre>";
print_r($data);
echo "</pre>";
if (json_last_error() != JSON_ERROR_NONE) {
echo 'Json error' . PHP_EOL;
exit(1);
}
$config = $data['config'];
//
// Fetching article
//
$owner_uid = 100;
$article = array( 'link' => $article_url, 'plugin_data' => '' );
echo fetch_article($article, $config)['content'];