From 5448bd752d281c1c86e47a7e50a14ce3e830cb1d Mon Sep 17 00:00:00 2001 From: d Date: Thu, 15 May 2014 12:42:56 +0200 Subject: [PATCH] Refactoring of init.php - fetching is now done by a global function in fetch.php. This allows to use this functions in test/xpath.php for testing. - besides of user configuration as explained in README.d json files from `mods` are now also loaded and used for fetching articles - loading of json files is done only one time and not for each article to process - test/xpath.php is now a script, which can be used to test a configuration from `mods`-directory Added two mods for 4players and visions.de. TODO: rewrite README.md --- fetch.php | 86 ++++++++++++ init.php | 312 ++++++++++++++++++------------------------- mods/4player.de.json | 12 ++ mods/visions.de.json | 11 ++ tests/xpath.php | 60 +++++---- 5 files changed, 274 insertions(+), 207 deletions(-) create mode 100644 fetch.php create mode 100644 mods/4player.de.json create mode 100644 mods/visions.de.json diff --git a/fetch.php b/fetch.php new file mode 100644 index 0000000..726059a --- /dev/null +++ b/fetch.php @@ -0,0 +1,86 @@ +=')) { + $html = fetch_file_contents($link); + $content_type = $fetch_last_content_type; + } + else { + // fallback to file_get_contents() + $html = file_get_contents($link); + + // try to fetch charset from HTTP headers + $headers = $http_response_header; + $content_type = false; + foreach ($headers as $h) { + if (substr(strtolower($h), 0, 13) == 'content-type:') { + $content_type = substr($h, 14); + // don't return here to find LATEST (if redirected) entry + } + } + } + + $charset = false; + if (!isset($config['force_charset'])) { + if ($content_type) { + preg_match('/charset=(\S+)/', $content_type, $matches); + if (isset($matches[1]) && !empty($matches[1])) $charset = $matches[1]; + } + } else { + // use forced charset + $charset = $config['force_charset']; + } + + if ($charset && isset($config['force_unicode']) && $config['force_unicode']) { + $html = iconv($charset, 'utf-8', $html); + $charset = 'utf-8'; + } + + if ($charset) { + $html = '' . $html; + } + + @$doc->loadHTML($html); + + if ($doc) { + $basenode = false; + $xpath = new DOMXPath($doc); + $entries = $xpath->query('(//'.$config['xpath'].')'); // find main DIV according to config + + if ($entries->length > 0) $basenode = $entries->item(0); + + if ($basenode) { + // remove nodes from cleanup configuration + if (isset($config['cleanup'])) { + if (!is_array($config['cleanup'])) { + $config['cleanup'] = array($config['cleanup']); + } + foreach ($config['cleanup'] as $cleanup) { + $nodelist = $xpath->query('//'.$cleanup, $basenode); + foreach ($nodelist as $node) { + if ($node instanceof DOMAttr) { + $node->ownerElement->removeAttributeNode($node); + } + else { + $node->parentNode->removeChild($node); + } + } + } + } + return $doc->saveXML($basenode); + } + } + + return $article; +} \ No newline at end of file diff --git a/init.php b/init.php index d324bad..c60c3c3 100644 --- a/init.php +++ b/init.php @@ -1,216 +1,170 @@ host = $host; + function api_version() + { + return 2; + } - $host->add_hook($host::HOOK_PREFS_TABS, $this); -# only allowed for system plugins: $host->add_handler('pref-feedmod', '*', $this); - $host->add_hook($host::HOOK_ARTICLE_FILTER, $this); - } + function init($host) + { + $this->host = $host; - function csrf_ignore($method) - { - $csrf_ignored = array("index", "edit"); - return array_search($method, $csrf_ignored) !== false; - } + $host->add_hook($host::HOOK_PREFS_TABS, $this); + # only allowed for system plugins: $host->add_handler('pref-feedmod', '*', $this); + $host->add_hook($host::HOOK_ARTICLE_FILTER, $this); + } - function before($method) - { - if ($_SESSION["uid"]) { - return true; - } - return false; - } + function csrf_ignore($method) + { + $csrf_ignored = array("index", "edit"); + return array_search($method, $csrf_ignored) !== false; + } - function after() - { - return true; - } + function before($method) + { + if ($_SESSION["uid"]) { + return true; + } + return false; + } - function hook_article_filter($article) - { - global $fetch_last_content_type; + function after() + { + return true; + } - $json_conf = $this->host->get($this, 'json_conf'); - $owner_uid = $article['owner_uid']; - $data = json_decode($json_conf, true); + function hook_article_filter($article) + { + global $fetch_last_content_type; - if (!is_array($data)) { - // no valid JSON or no configuration at all - return $article; - } + $owner_uid = $article['owner_uid']; - foreach ($data as $urlpart=>$config) { - if (strpos($article['link'], $urlpart) === false) continue; // skip this config if URL not matching - if (strpos($article['plugin_data'], "feedmod,$owner_uid:") !== false) { - // do not process an article more than once - if (isset($article['stored']['content'])) $article['content'] = $article['stored']['content']; - break; - } + // + // Load mods if they are not already loaded + // + if (!$this->mods_loaded) { + // + // Reading mod files + // - switch ($config['type']) { - case 'xpath': - $doc = new DOMDocument(); - $link = trim($article['link']); + $this->mods = array(); + // bad (!) hardcoded path + $mod_files = glob('plugins/af_feedmod/mods/*.json'); + foreach ($mod_files as $file) { + $json = file_get_contents($file); + $mod = json_decode($json, true); + if (json_last_error() != JSON_ERROR_NONE) + continue; + + if (!isset($mod['match']) || !isset($mod['config'])) + continue; - if (version_compare(VERSION, '1.7.9', '>=')) { - $html = fetch_file_contents($link); - $content_type = $fetch_last_content_type; - } else { - // fallback to file_get_contents() - $html = file_get_contents($link); + $this->mods[$mod['match']] = $mod['config']; + } - // try to fetch charset from HTTP headers - $headers = $http_response_header; - $content_type = false; - foreach ($headers as $h) { - if (substr(strtolower($h), 0, 13) == 'content-type:') { - $content_type = substr($h, 14); - // don't break here to find LATEST (if redirected) entry - } - } - } - - $charset = false; - if (!isset($config['force_charset'])) { - if ($content_type) { - preg_match('/charset=(\S+)/', $content_type, $matches); - if (isset($matches[1]) && !empty($matches[1])) $charset = $matches[1]; - } - } else { - // use forced charset - $charset = $config['force_charset']; - } - - if ($charset && isset($config['force_unicode']) && $config['force_unicode']) { - $html = iconv($charset, 'utf-8', $html); - $charset = 'utf-8'; - } - - if ($charset) { - $html = '' . $html; - } - - - - + // + // User mods + // - @$doc->loadHTML($html); + $json_conf = $this->host->get($this, 'json_conf'); + $user_mods = json_decode($json_conf, true); + if (is_array($user_mods)) + $this->mods = array_merge($this->mods, $user_mods); - if ($doc) { - $basenode = false; - $xpath = new DOMXPath($doc); - $entries = $xpath->query('(//'.$config['xpath'].')'); // find main DIV according to config + $this->mods_loaded = true; + } - if ($entries->length > 0) $basenode = $entries->item(0); + // article is already fetched + if (strpos($article['plugin_data'], "feedmod,$owner_uid:") !== false && isset($article['stored']['content'])) + { + $article['content'] = $article['stored']['content']; + return $article; + } - if ($basenode) { - // remove nodes from cleanup configuration - if (isset($config['cleanup'])) { - if (!is_array($config['cleanup'])) { - $config['cleanup'] = array($config['cleanup']); - } - foreach ($config['cleanup'] as $cleanup) { - $nodelist = $xpath->query('//'.$cleanup, $basenode); - foreach ($nodelist as $node) { - if ($node instanceof DOMAttr) { - $node->ownerElement->removeAttributeNode($node); - } - else { - $node->parentNode->removeChild($node); - } - } - } - } - $article['content'] = $doc->saveXML($basenode); - $article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data']; - } - } - break; + foreach ($this->mods as $urlpart=>$config) { + if (strpos($article['link'], $urlpart) === false) continue; - default: - // unknown type or invalid config - continue; - } + $content = fetch_article($article, $config); + if (!$content) + break; - break; // if we got here, we found the correct entry in $data, do not process more - } + $article['content'] = $content; + $article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data']; + } - return $article; - } + return $article; + } - function hook_prefs_tabs($args) - { - print '
'; - } + function hook_prefs_tabs($args) + { + print '
'; + } - function index() - { - $pluginhost = PluginHost::getInstance(); - $json_conf = $pluginhost->get($this, 'json_conf'); + function index() + { + $pluginhost = PluginHost::getInstance(); + $json_conf = $pluginhost->get($this, 'json_conf'); - print "
"; + print ""; - print ""; + print ""; - print ""; - print ""; - print ""; + print ""; + print ""; + print ""; - print "
"; - print ""; - print "
"; + print "
"; + print ""; + print "
"; - print "

"; + print "

"; - print "

"; - } + print ""; + } - function save() - { - $json_conf = $_POST['json_conf']; + function save() + { + $json_conf = $_POST['json_conf']; - if (is_null(json_decode($json_conf))) { - echo __("error: Invalid JSON!"); - return false; - } - - $this->host->set($this, 'json_conf', $json_conf); - echo __("Configuration saved."); - } + if (is_null(json_decode($json_conf))) { + echo __("error: Invalid JSON!"); + return false; + } + $this->host->set($this, 'json_conf', $json_conf); + echo __("Configuration saved."); + } } diff --git a/mods/4player.de.json b/mods/4player.de.json new file mode 100644 index 0000000..32e9480 --- /dev/null +++ b/mods/4player.de.json @@ -0,0 +1,12 @@ +{ + "name": "4players.de", + "author": "boxdot", + "feed": "http://feeds.4players.de/Allgemein/news/-/rss.xml", + "match": "4players.de", + "config": { + "type": "xpath", + "xpath": "article", + "force_charset": "utf-8", + "cleanup": ["header", "footer", "div[contains(@class, 'social-facebook')]"] + } +} diff --git a/mods/visions.de.json b/mods/visions.de.json new file mode 100644 index 0000000..f904475 --- /dev/null +++ b/mods/visions.de.json @@ -0,0 +1,11 @@ +{ + "name": "VISIONS.de", + "author": "boxdot", + "feed": "http://rss.feedsportal.com/c/32350/f/443184/index.rss", + "match": "visions0Bde0C", + "config": { + "type": "xpath", + "xpath": "div[contains(@class, 'marginbt')]", + "force_charset": "utf-8" + } +} diff --git a/tests/xpath.php b/tests/xpath.php index 4e422ad..130f288 100644 --- a/tests/xpath.php +++ b/tests/xpath.php @@ -1,33 +1,37 @@ 'xpath', - 'xpath' => 'div[@itemprop="articleBody"]', -); +require_once('../fetch.php'); -$article = array( - 'link' => 'http://www.der-postillon.com/2013/04/nordkoreas-armee-nach-wochenlangem.html', - 'content' => 'This is the feed content', - 'plugin_data' => '', -); - -$doc = new DOMDocument(); -$html = file_get_contents($article['link']); -$doc->loadHTML($html); - -if ($doc) { - $basenode = false; - $xpath = new DOMXPath($doc); - $entries = $xpath->query('(//'.$config['xpath'].')'); // find main DIV according to config - - var_dump($entries); - - if ($entries->length > 0) $basenode = $entries->item(0); - - if ($basenode) { - $article['content'] = $doc->saveXML($basenode); - $article['plugin_data'] = "feedmod,$owner_uid:" . $article['plugin_data']; - } +if (count($argv) <= 2) { + echo 'USAGE: php fetch.php [mod_file] [article_url]' . PHP_EOL; + exit(1); } -print_r($article); \ No newline at end of file +$mod = $argv[1]; +$article_url = $argv[2]; + +// +// Getting json config +// + +$json = file_get_contents($mod); +$data = json_decode($json, true); + +echo "
";
+print_r($data);
+echo "
"; + +if (json_last_error() != JSON_ERROR_NONE) { + echo 'Json error' . PHP_EOL; + exit(1); +} + +$config = $data['config']; + +// +// Fetching article +// + +$owner_uid = 100; +$article = array( 'link' => $article_url, 'plugin_data' => '' ); +echo fetch_article($article, $config)['content'];