Skip to content

Commit

Permalink
Merge pull request #1 from base-cms/clean-elements
Browse files Browse the repository at this point in the history
Add unit tests and update pennwell default rule
  • Loading branch information
zarathustra323 authored Apr 9, 2019
2 parents 1bcec43 + 4a536e1 commit 2e199eb
Show file tree
Hide file tree
Showing 19 changed files with 1,579 additions and 36 deletions.
23 changes: 21 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"lint": "node_modules/.bin/gulp lint",
"dev": "node_modules/.bin/gulp",
"serve": "NODE_ENV=production node src/index",
"test": "node_modules/.bin/gulp lint"
"test": "node_modules/.bin/gulp lint && node_modules/.bin/mocha",
"coverage": "node_modules/.bin/nyc node_modules/.bin/mocha"
},
"dependencies": {
"@base-cms/object-path": "^0.6.0",
Expand All @@ -20,11 +21,29 @@
"express": "^4.16.4"
},
"devDependencies": {
"chai": "^4.2.0",
"chai-as-promised": "^7.1.1",
"eslint": "^5.16.0",
"eslint-config-airbnb-base": "^13.1.0",
"eslint-plugin-import": "^2.16.0",
"gulp": "^4.0.0",
"gulp-cached": "^1.1.1",
"gulp-eslint": "^5.0.0"
"gulp-eslint": "^5.0.0",
"mocha": "^6.1.2",
"nyc": "^13.3.0",
"sinon": "^7.3.1",
"supertest": "^4.0.2"
},
"nyc": {
"check-coverage": false,
"per-file": true,
"reporter": [
"text"
],
"all": true,
"cache": false,
"include": [
"src/**/*.js"
]
}
}
5 changes: 5 additions & 0 deletions scripts/coverage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Runs `run coverage` against the `yarn` docker-compose service in a
# throwaway container (--rm), forwarding any extra command-line arguments.
# "$@" is quoted so arguments containing spaces or glob characters are passed
# through unchanged instead of being re-split by the shell.
docker-compose run \
--rm \
yarn \
run coverage "$@"
5 changes: 5 additions & 0 deletions scripts/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Runs `run test` against the `yarn` docker-compose service in a throwaway
# container (--rm), forwarding any extra command-line arguments.
# "$@" is quoted so arguments containing spaces or glob characters are passed
# through unchanged instead of being re-split by the shell.
docker-compose run \
--rm \
yarn \
run test "$@"
3 changes: 3 additions & 0 deletions src/rules/pennwell/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ To access, send a `POST` request to `/pennwell/default`. This rule set performs
- Extracts the `deck` text from elements classed with `.paraStyle_headline_deck` and removes the element from the cleaned HTML.
- Extracts an `author` object from elements classed with `.paraStyle_byline` or `.paraStyle_body_bio` and removes the elements from the cleaned HTML.
- If an `<h1>` is detected anywhere in the body, all heading elements are increased by one (e.g. `<h1>` becomes `<h2>`, `<h2>` becomes `<h3>`, etc).
- Removes all `<form>` and `<style>` elements.
- Removes all `id`, `class`, `style` and `data-*` attributes from elements.
- Removes PennNet.com iframe embeds, i.e. any `<iframe>` matching `iframe[src*="pennnet.com"]`.

### Examples

Expand Down
41 changes: 36 additions & 5 deletions src/rules/pennwell/default.js
Original file line number Diff line number Diff line change
@@ -1,20 +1,42 @@
const cheerio = require('cheerio');
const adjustHeadings = require('../../utils/adjust-headings');
const removeElements = require('../../utils/remove-elements');
const stripWhitespace = require('../../utils/strip-whitespace');
const removeAttributes = require('../../utils/remove-attributes');
const removeDataAttributes = require('../../utils/remove-data-attributes');

/**
 * Strips the `class`, `id` and `style` attributes, followed by every
 * `data-*` attribute, from the loaded document.
 *
 * @param {Function} $ - The loaded cheerio document.
 */
const removeAttrs = ($) => {
  const namedAttributes = ['class', 'id', 'style'];
  removeAttributes($, namedAttributes);
  removeDataAttributes($);
};

/**
 * Loads an HTML string into cheerio with `decodeEntities` turned off.
 *
 * @param {string} html - The markup to load.
 * @returns {Function} The value returned by `cheerio.load`.
 */
const loadHTML = (html) => {
  const options = { decodeEntities: false };
  return cheerio.load(html, options);
};

/**
 * Normalizes a text value: falsy input becomes an empty string, every run of
 * whitespace collapses to a single space, and the result is trimmed.
 *
 * @param {?string} v - The raw text.
 * @returns {string} The normalized text.
 */
const cleanTextValue = (v) => {
  const text = v || '';
  const collapsed = text.replace(/\s+/g, ' ');
  return collapsed.trim();
};

// Pulls the deck text out of the `.paraStyle_headline_deck` element(s) and
// removes the matched element(s) from the document.
// Returns null when nothing matches or when the resulting text is empty.
const extractDeck = ($) => {
const className = '.paraStyle_headline_deck';
const element = $(className);
// Nothing matched: there is no deck to extract.
if (!element.length) return null;
const deck = (element.text() || '').trim() || null;
const deck = cleanTextValue(element.text()) || null;
// Replace the matched element(s) with an empty string so the deck no longer
// appears in the cleaned HTML.
element.replaceWith('');
return deck;
};

/**
 * Cleans an author bio fragment by loading it as its own document and
 * stripping attributes from it.
 *
 * @param {?string} bio - The bio HTML.
 * @returns {?string} The cleaned `<body>` HTML, or null when `bio` is falsy.
 */
const cleanBio = (bio) => {
  if (bio) {
    const $bio = loadHTML(bio);
    removeAttrs($bio);
    return $bio('body').html();
  }
  return null;
};

const extractAuthor = ($) => {
const bylineClass = '.paraStyle_byline';
const bioClass = '.paraStyle_body_bio';

const name = ($(bylineClass).text() || '').trim().replace(/^by/i, '').trim();
const name = cleanTextValue($(bylineClass).text()).replace(/^by/i, '').trim();

let image = null;
let bio = '';
Expand All @@ -33,19 +55,28 @@ const extractAuthor = ($) => {
return {
name: name || null,
image: image || null,
bio: bio || null,
bio: cleanBio(bio),
};
};

module.exports = async (body) => {
const html = (body || '').replace(/\s\s+/g, '');
const $ = cheerio.load(html, { decodeEntities: false });
const html = stripWhitespace(body);
const $ = loadHTML(html);

const deck = extractDeck($);
const author = extractAuthor($);

adjustHeadings($);

// Remove form and style elements.
removeElements($, 'form, style');

// Remove buyer's guide iframe search embeds.
removeElements($, 'iframe[src*="pennnet.com"]');

// Remove attributes.
removeAttrs($);

return {
extracted: {
deck,
Expand Down
10 changes: 10 additions & 0 deletions src/utils/remove-attributes.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
const { isArray } = Array;

module.exports = ($, attributes) => {
const attrs = isArray(attributes) ? attributes : [];
attrs.forEach((attr) => {
$(`[${attr}]`).each(function () {
$(this).removeAttr(attr);
});
});
};
10 changes: 10 additions & 0 deletions src/utils/remove-data-attributes.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module.exports = ($) => {
$('*').each(function () {
const { attribs } = $(this)[0];
Object.keys(attribs).forEach((attr) => {
if (/^data-/i.test(attr)) {
$(this).removeAttr(attr);
}
});
});
};
5 changes: 5 additions & 0 deletions src/utils/remove-elements.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module.exports = ($, selector) => {
$(selector).each(function () {
$(this).replaceWith('');
});
};
28 changes: 28 additions & 0 deletions src/utils/strip-whitespace.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
const cheerio = require('cheerio');

/**
 * Recursively tidies leading text. When the first child node of `$el` is a
 * text node, runs of two or more whitespace characters in it collapse to a
 * single space and the text is trimmed. The same is then applied to every
 * child element.
 *
 * @param {Function} $ - The loaded cheerio document.
 * @param {Object} $el - The wrapped element to clean (mutated in place).
 */
const cleanElementText = ($, $el) => {
  const firstNode = $el.contents()[0];
  const isText = Boolean(firstNode) && firstNode.type === 'text';
  if (isText) {
    // A text node without `data` is treated as empty text.
    const raw = firstNode.data === undefined ? '' : firstNode.data;
    firstNode.data = raw.replace(/\s\s+/g, ' ').trim();
  }

  const $children = $el.children();
  if (!$children.length) return;
  $children.each((_index, child) => {
    cleanElementText($, $(child));
  });
};

module.exports = (html) => {
const str = (html || '')
.replace(/[\r\n\f\v\t\b\\]/g, ' ')
.trim()
.replace(/>\s+</g, '><')
.replace(/\s+%{\[/g, '%{[')
.replace(/\]}%\s+/g, ']}%');
const $ = cheerio.load(str, { decodeEntities: false });
cleanElementText($, $('body'));
return $('body').html();
};
7 changes: 7 additions & 0 deletions test/.eslintrc.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
module.exports = {
globals: {
describe: 'readonly',
it: 'readonly',
expect: 'readonly',
},
};
12 changes: 12 additions & 0 deletions test/bootstrap.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Mocha bootstrap, loaded through `--require ./test/bootstrap.js` in
// test/mocha.opts. Exposes the shared test helpers as globals and registers
// the chai-as-promised plugin.
const chai = require('chai');
const request = require('supertest');
const sinon = require('sinon');
const chaiAsPromised = require('chai-as-promised');

// Make the helpers available to every spec file without an explicit require.
global.chai = chai;
global.request = request;
global.sinon = sinon;
global.expect = chai.expect;

// Register the chai-as-promised plugin with chai.
chai.use(chaiAsPromised);
4 changes: 4 additions & 0 deletions test/mocha.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
--recursive
--async-only
--timeout 2000
--require ./test/bootstrap.js
Loading

0 comments on commit 2e199eb

Please sign in to comment.