anshu-krishna
diff --git a/.gitattributes b/.gitattributes
Lines changed: 0 additions & 2 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 12 additions & 0 deletions
diff --git a/DOC.md b/DOC.md
Lines changed: 12 additions & 12 deletions
diff --git a/LICENSE b/LICENSE
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 11 additions & 7 deletions
diff --git a/composer.json b/composer.json
Lines changed: 21 additions & 0 deletions
diff --git a/example/.gitignore b/example/.gitignore
Lines changed: 12 additions & 0 deletions
diff --git a/example/composer.json b/example/composer.json
Lines changed: 24 additions & 0 deletions
diff --git a/example/example.php b/example/example.php
Lines changed: 17 additions & 16 deletions
diff --git a/example/example_css_to_xpath.php b/example/example_css_to_xpath.php
Lines changed: 5 additions & 2 deletions
@@ -0,0 +1,12 @@
+composer.phar
+composer.lock
+/vendor/
+
+# Node artifact files
+**/node_modules/
+
+# Generated by macOS
+.DS_Store
+
+# Generated by Windows
+Thumbs.db
@@ -1,16 +1,16 @@
-# Class HTML_Scraper
+# Class HTMLScraper
 ### Static Functions:
 -`new_from($source)`
 
-Create a new HTML_Scraper object from the passed source. 
+Create a new HTMLScraper object from the passed source. 
 `$source` can be of type `DOMNodeList`, `DOMNode` or `string`.
 
 **Returns:** 
 
 | Type | Description |
 |---|---|
-| `array` | When `$source` is an instance of `DOMNodeList` then returns an `array` of `HTML_Scraper` objects. |
-| `HTML_Scraper` | When `$source` is an instance of `DOMNode` or a `string` |
+| `array` | When `$source` is an instance of `DOMNodeList` then returns an `array` of `HTMLScraper` objects. |
+| `HTMLScraper` | When `$source` is an instance of `DOMNode` or a `string` |
 
 
 -`CSS_to_Xpath(string $path) : string`
@@ -20,7 +20,7 @@
 ### Functions:
 -`__toString() : string`
 
-Magic function to convert `HTML_Scraper` into a `string` containing the HTML code of the loaded document.
+Magic function to convert `HTMLScraper` into a `string` containing the HTML code of the loaded document.
 
 
 -`textContent() : string`
@@ -43,7 +43,7 @@
 Load HTML from a file.
 
 -	`$options` 
-*see `$options` in `HTML_Scraper->load_HTML_str()`*
+*see `$options` in `HTMLScraper->load_HTML_str()`*
 
 -	`$context` 
 *see `$context` in `stream_context_create()`*
@@ -74,11 +74,11 @@
 
 -`querySelector(string $selector, int ...$items)`
 
-Same as `HTML_Scraper->xpath()` except that it uses CSS selector instead of *XPath* path expression.
+Same as `HTMLScraper->xpath()` except that it uses CSS selector instead of *XPath* path expression.
 
 -`xpath_extract($mapper, string $expr, int ...$items)`
 
-Find `DOMNode`(s) in the same way as in `HTML_Scraper->xpath()` then extract data from the `DOMNode`(s) as specified by the `$mapper`.
+Find `DOMNode`(s) in the same way as in `HTMLScraper->xpath()` then extract data from the `DOMNode`(s) as specified by the `$mapper`.
 
 -	`$mapper` 
 It can be any one of the `string` values specified below, or a `function` that takes a `DOMNode` and returns the extracted value.
@@ -91,7 +91,7 @@
 
 -`querySelector_extract($mapper, string $selector, int ...$items)`
 
-Same as `HTML_Scraper->xpath_extract()` except that it uses CSS selector instead of *XPath* path expression.
+Same as `HTMLScraper->xpath_extract()` except that it uses CSS selector instead of *XPath* path expression.
 
 ---
 
@@ -111,7 +111,7 @@
 
 -`xpath(DOMNode &$node, string $expr, int ...$items)`
 
-Similar to `HTML_Scraper->xpath()` except that it works on a `DOMNode` instead of the `HTML_Scraper`'s `DOMDocument`.
+Similar to `HTMLScraper->xpath()` except that it works on a `DOMNode` instead of the `HTMLScraper`'s `DOMDocument`.
 
 -`querySelector(DOMNode &$node, string $selector, int ...$items)`
 
@@ -122,7 +122,7 @@
 Get one or more child nodes of the `DOMNode`.
 
 -	`$indexes` 
-*See `$items` in `HTML_Scraper->expath()`.*
+*See `$items` in `HTMLScraper->xpath()`.*
 
 **Returns:**
 
@@ -154,4 +154,4 @@
 Removes the child elements of the passed `DOMNode` specified by the `...$indexes`.
 
 -	`$indexes` 
-*See `$items` in `HTML_Scraper->expath()`.*
+*See `$items` in `HTMLScraper->xpath()`.*
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2018 Anshu Krishna
+Copyright (c) 2021 Anshu Krishna
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -18,9 +18,14 @@ For *basic* documentation see the DOC file.
 ### Example
 ```php
 <?php
-require_once 'HTML_Scraper.php';
+require_once 'vendor/autoload.php';
 
-$doc = new HTML_Scraper;
+use Krishna\DOMNodeHelper;
+use Krishna\HTMLScraper;
+
+const TrimmedText = HTMLScraper::Extract_textContentTrim;
+
+$doc = new HTMLScraper();
 
 if(!$doc->load_HTML_file('https://www.royalroad.com/fiction/10073/the-wandering-inn')) {
 echo 'Unable to load data';
@@ -29,18 +34,17 @@ if(!$doc->load_HTML_file('https://www.royalroad.com/fiction/10073/the-wandering-
 
 $data = [];
 
-$data['title'] = $doc->querySelector_extract('textContentTrim', 'div.fic-title h1[property="name"]', 0);
+$data['title'] = $doc->querySelector_extract(TrimmedText, 'div.fic-title h1[property="name"]', 0);
 
 $data['url'] = $doc->xpath_extract(function($meta) {
 return $meta->getAttribute('content');
 }, '//meta[@property="og:url"]', 0);
 
-$data['description'] = $doc->querySelector_extract(function(&$div) {
+$data['description'] = htmlspecialchars($doc->querySelector_extract(function(&$div) {
 return trim(DOMNodeHelper::innerHTML($div));
-}, 'div.description div[property="description"]', 0);
+}, 'div.description div[property="description"]', 0));
 
-$data['tags'] = $doc->querySelector_extract('textContentTrim', 'span.tags span[property="genre"]');
+$data['tags'] = $doc->querySelector_extract(TrimmedText, 'span.tags span[property="genre"]');
 
 var_dump($data);
-?>
 ```
@@ -0,0 +1,21 @@
+{
+ "name": "anshu-krishna/html-scraper",
+ "description": "A set of PHP classes to simplify data extraction from HTML.",
+ "type": "library",
+ "license": "MIT",
+ "authors": [
+ {
+ "name": "Anshu Krishna",
+ "email": "anshu.krishna5@gmail.com"
+ }
+ ],
+ "version": "3.5.0",
+ "require": {
+ "php": ">=8.0.0"
+ },
+ "autoload": {
+ "psr-4": {
+ "Krishna\\": "src"
+ }
+ }
+}
@@ -0,0 +1,12 @@
+composer.phar
+composer.lock
+/vendor/
+
+# Node artifact files
+**/node_modules/
+
+# Generated by macOS
+.DS_Store
+
+# Generated by Windows
+Thumbs.db
@@ -0,0 +1,24 @@
+{
+ "name": "anshu-krishna/scraper-example",
+ "description": "Examples for HTMLScraper",
+ "type": "project",
+ "license": "MIT",
+ "authors": [
+ {
+ "name": "Anshu Krishna",
+ "email": "anshu.krishna5@gmail.com"
+ }
+ ],
+ "version": "1.0.0",
+ "repositories": [{
+ "type": "path",
+ "url": "..",
+ "options": {
+ "symlink": true
+ }
+ }],
+ "require": {
+ "php": ">=8.0.0",
+ "anshu-krishna/html-scraper" : "*"
+ }
+}
@@ -1,15 +1,18 @@
+<style>
+pre {
+white-space: pre-wrap;
+}
+</style>
+<pre>
 <?php
-// function echo_html(string $html_source) {
-//	echo '<pre>', htmlentities($html_source), '</pre>';
-// }
+require_once 'vendor/autoload.php';
 
-// function echo_DOMNode(DOMNode $node) {
-//	echo_html(DOMNodeHelper::outerHTML($node));
-// }
+use Krishna\DOMNodeHelper;
+use Krishna\HTMLScraper;
 
-require_once '../HTML_Scraper.php';
+const TrimmedText = HTMLScraper::Extract_textContentTrim;
 
-$doc = new HTML_Scraper;
+$doc = new HTMLScraper();
 
 if(!$doc->load_HTML_file('sample_data_file.html')) {
 echo 'Unable to load data';
@@ -26,13 +29,13 @@
 
 $data = [];
 
-$data['title'] = $doc->querySelector_extract('textContentTrim', 'div.fic-title h1[property="name"]', 0);
+$data['title'] = $doc->querySelector_extract(TrimmedText, 'div.fic-title h1[property="name"]', 0);
 
 $data['url'] = $doc->xpath_extract(function($meta) {
 return $meta->getAttribute('content');
 }, '//meta[@property="og:url"]', 0);
 
-$data['auth'] = $doc->querySelector_extract('textContentTrim', 'div.fic-title h4[property="author"] span[property="name"]', 0);
+$data['auth'] = $doc->querySelector_extract(TrimmedText, 'div.fic-title h4[property="author"] span[property="name"]', 0);
 
 $data['auth_link'] = $doc->querySelector_extract(function(&$a) {
 return 'https://www.royalroad.com' . $a->getAttribute('href');
@@ -56,11 +59,11 @@
 return 275 * $pages;
 }, 'li[property="numberOfPages"]', 0);
 
-$data['desc'] = $doc->querySelector_extract(function(&$div) {
+$data['desc'] = htmlspecialchars($doc->querySelector_extract(function(&$div) {
 return trim(DOMNodeHelper::innerHTML($div));
-}, 'div.description div[property="description"]', 0);
+}, 'div.description div[property="description"]', 0));
 
-$data['tags'] = $doc->querySelector_extract('textContentTrim', 'span.tags span[property="genre"]');
+$data['tags'] = $doc->querySelector_extract(TrimmedText, 'span.tags span[property="genre"]');
 
 $replace = NULL;
 if($data['url'] !== NULL && preg_match("/http[s]?:\/\/www\.royalroad\.com\/(.+)\/?/", $data['url'], $mtc)) {
@@ -85,6 +88,4 @@
 if(is_array($data['ch_links'])) {
 $data['chaps'] = count($data['ch_links']);
 }
-
-var_dump($data);
-?>
+echo json_encode($data, JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE | JSON_PARTIAL_OUTPUT_ON_ERROR);
@@ -25,7 +25,10 @@
 <header>CSS</header>
 <header>XPath</header>
 <?php
-require_once '../html_scraper.php';
+require_once 'vendor/autoload.php';
+
+use Krishna\HTMLScraper;
+
 $examples = [
 'div',
 'div.abc',
@@ -47,7 +50,7 @@
 $examples = array_map(function($selector) {
 return implode(PHP_EOL, array_map(function($str) {
 return "<span>" . htmlspecialchars($str) . "</span>";
-}, [$selector, HTML_Scraper::CSS_to_Xpath($selector)]));
+}, [$selector, HTMLScraper::CSS_to_Xpath($selector)]));
 }, $examples);
 
 echo implode(PHP_EOL, $examples);