<style>
pre {
white-space: pre-wrap;
}
</style>
<pre>
<?php
// Everything echoed by this script is emitted after the <pre> tag opened above;
// the pre-wrap rule lets long lines wrap instead of overflowing horizontally.

// Composer autoloader: makes the Krishna\* classes imported below available.
require_once 'vendor/autoload.php';

use Krishna\{DOMNodeHelper, HTMLScraper};

// Short alias for the HTMLScraper::Extract_textContentTrim extractor, used
// wherever a *_extract call needs that extractor.
const TrimmedText = HTMLScraper::Extract_textContentTrim;

// Scraper instance that every query in this script runs against.
$doc = new HTMLScraper();
1316
1417if (!$ doc ->load_HTML_file ('sample_data_file.html ' )) {
1518echo 'Unable to load data ' ;
2629
// Collected fields, keyed by field name.
$data = [];

// Story title, extracted with the TrimmedText extractor.
// NOTE(review): the trailing 0 presumably selects the first match - confirm
// against HTMLScraper.
$data['title'] = $doc->querySelector_extract(
    TrimmedText,
    'div.fic-title h1[property="name"]',
    0
);

// Page URL: the "content" attribute of the og:url <meta> node.
$readContentAttribute = function ($meta) {
    return $meta->getAttribute('content');
};
$data['url'] = $doc->xpath_extract(
    $readContentAttribute,
    '//meta[@property="og:url"]',
    0
);

// Author display name, extracted the same way as the title.
$data['auth'] = $doc->querySelector_extract(
    TrimmedText,
    'div.fic-title h4[property="author"] span[property="name"]',
    0
);
3639
3740$ data ['auth_link ' ] = $ doc ->querySelector_extract (function (&$ a ) {
3841return 'https://www.royalroad.com ' . $ a ->getAttribute ('href ' );
5659return 275 * $ pages ;
5760}, 'li[property="numberOfPages"] ' , 0 );
5861
// Description: trimmed inner HTML of the description node.
$descriptionHtml = $doc->querySelector_extract(function (&$div) {
    return trim(DOMNodeHelper::innerHTML($div));
}, 'div.description div[property="description"]', 0);

// Escape the markup so it is shown as text rather than rendered.
// htmlspecialchars() expects a string and passing null is deprecated since
// PHP 8.1. The script's own NULL check on $data['url'] suggests an extractor
// can return null when nothing matches (presumably the same here - confirm
// against HTMLScraper), so coalesce to '' first. The stored value is the same
// empty string the implicit conversion used to produce.
$data['desc'] = htmlspecialchars($descriptionHtml ?? '');

// Genre tags. No index argument is passed here, unlike the single-value
// lookups. NOTE(review): presumably this returns every match - confirm.
$data['tags'] = $doc->querySelector_extract(TrimmedText, 'span.tags span[property="genre"]');
6467
6568$ replace = NULL ;
6669if ($ data ['url ' ] !== NULL && preg_match ("/http[s]?:\/\/www\.royalroad\.com\/(.+)\/?/ " , $ data ['url ' ], $ mtc )) {
// Record the chapter count only when the chapter links are an array.
$chapterLinks = $data['ch_links'];
if (is_array($chapterLinks)) {
    $data['chaps'] = count($chapterLinks);
}

// Print the collected data as pretty-printed JSON. Invalid UTF-8 is
// substituted and unencodable values yield partial output rather than
// json_encode() failing outright.
$jsonFlags = JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE | JSON_PARTIAL_OUTPUT_ON_ERROR;
echo json_encode($data, $jsonFlags);
0 commit comments