From 4823777c75498c29313303be463bfc6687be4021 Mon Sep 17 00:00:00 2001 From: Jacob Bare Date: Wed, 10 Apr 2019 12:15:01 -0500 Subject: [PATCH 1/3] Adjust headings on h1 or h2 and increment by 2 --- src/utils/adjust-headings.js | 5 +++-- test/utils/adjust-headings.spec.js | 35 ++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/utils/adjust-headings.js b/src/utils/adjust-headings.js index d497b10..5793523 100644 --- a/src/utils/adjust-headings.js +++ b/src/utils/adjust-headings.js @@ -3,11 +3,12 @@ const cheerio = require('cheerio'); const selector = 'h1, h2, h3, h4, h5'; module.exports = ($) => { - if ($('h1').length) { + if ($('h1').length || $('h2').length) { $(selector).each(function () { const tag = $(this).prop('tagName').toLowerCase(); const [, num] = [...tag]; - const newTag = `h${Number(num) + 1}`; + const n = Number(num); + const newTag = `h${n < 5 ? n + 2 : n + 1}`; const { attribs } = $(this)[0]; const $new = cheerio.load(`<${newTag}>${$(this).html()}`)('span'); diff --git a/test/utils/adjust-headings.spec.js b/test/utils/adjust-headings.spec.js index de5eac0..e9a7b3d 100644 --- a/test/utils/adjust-headings.spec.js +++ b/test/utils/adjust-headings.spec.js @@ -20,18 +20,39 @@ describe('utils/adjust-headings', () => { const $ = cheerio.load(body); adjustHeadings($); expect($('h1').length).to.equal(0, 'Number of

<h1> elements'); - expect($('h2').length).to.equal(1, 'Number of <h2> elements'); - expect($('h3').length).to.equal(2, 'Number of <h3> elements'); + expect($('h2').length).to.equal(0, 'Number of <h2> elements'); + expect($('h3').length).to.equal(1, 'Number of <h3> elements'); expect($('h4').length).to.equal(2, 'Number of <h4> elements'); - expect($('h5').length).to.equal(1, 'Number of <h5> elements'); - expect($('h6').length).to.equal(2, 'Number of <h6> elements'); + expect($('h5').length).to.equal(2, 'Number of <h5> elements'); + expect($('h6').length).to.equal(3, 'Number of
<h6> elements'); }); - it('should not adjust heading elements when an <h1> is not present.', async () => { + it('should adjust heading elements when an <h2> is present.', async () => { const body = `
-

Foo

Bar

Bar

+
+

Foo

+

Foo

+

Foo

+
Foo
+
Foo
+
+
+ `; + const $ = cheerio.load(body); + adjustHeadings($); + expect($('h1').length).to.equal(0, 'Number of

<h1> elements'); + expect($('h2').length).to.equal(0, 'Number of <h2> elements'); + expect($('h3').length).to.equal(0, 'Number of <h3> elements'); + expect($('h4').length).to.equal(2, 'Number of <h4> elements'); + expect($('h5').length).to.equal(2, 'Number of <h5> elements'); + expect($('h6').length).to.equal(3, 'Number of
<h6> elements'); + }); + + it('should not adjust heading elements when an <h1> or <h2> is not present.', async () => { + const body = `

Foo

Foo

@@ -44,7 +65,7 @@ describe('utils/adjust-headings', () => { const $ = cheerio.load(body); adjustHeadings($); expect($('h1').length).to.equal(0, 'Number of

<h1> elements'); - expect($('h2').length).to.equal(3, 'Number of <h2> elements'); + expect($('h2').length).to.equal(0, 'Number of <h2> elements'); expect($('h3').length).to.equal(2, 'Number of <h3> elements'); expect($('h4').length).to.equal(1, 'Number of <h4> elements'); expect($('h5').length).to.equal(1, 'Number of <h5>

elements'); From 3ee8598dcba2ef0458ce3ed2c2b0557c80ff4eac Mon Sep 17 00:00:00 2001 From: Jacob Bare Date: Wed, 10 Apr 2019 12:15:10 -0500 Subject: [PATCH 2/3] Do not extract values from HTML --- src/rules/pennwell/default.js | 97 ++++++++--------- test/rules/pennwell/default.spec.js | 163 ++++++++++++++-------------- 2 files changed, 128 insertions(+), 132 deletions(-) diff --git a/src/rules/pennwell/default.js b/src/rules/pennwell/default.js index 35023e1..bd307a4 100644 --- a/src/rules/pennwell/default.js +++ b/src/rules/pennwell/default.js @@ -14,57 +14,57 @@ const removeAttrs = ($) => { const loadHTML = html => cheerio.load(html, { decodeEntities: false }); -const cleanTextValue = v => (v || '').replace(/\s+/g, ' ').trim(); - -const extractDeck = ($) => { - const className = '.paraStyle_headline_deck'; - const element = $(className); - if (!element.length) return null; - const deck = cleanTextValue(element.text()) || null; - element.replaceWith(''); - return deck; -}; - -const cleanBio = (bio) => { - if (!bio) return null; - const $ = loadHTML(bio); - removeAttrs($); - return $('body').html(); -}; - -const extractAuthor = ($) => { - const bylineClass = '.paraStyle_byline'; - const bioClass = '.paraStyle_body_bio'; - - const name = cleanTextValue($(bylineClass).text()).replace(/^by/i, '').trim(); - - let image = null; - let bio = ''; - - $(bioClass).each(function () { - const imgElement = $(this).children('img'); - if (imgElement.length) { - image = imgElement.attr('src'); - } else { - bio = `${bio}

${$(this).html()}

`; - } - }); - - $(bylineClass).replaceWith(''); - $(bioClass).replaceWith(''); - return { - name: name || null, - image: image || null, - bio: cleanBio(bio), - }; -}; +// const cleanTextValue = v => (v || '').replace(/\s+/g, ' ').trim(); + +// const extractDeck = ($) => { +// const className = '.paraStyle_headline_deck'; +// const element = $(className); +// if (!element.length) return null; +// const deck = cleanTextValue(element.text()) || null; +// element.replaceWith(''); +// return deck; +// }; + +// const cleanBio = (bio) => { +// if (!bio) return null; +// const $ = loadHTML(bio); +// removeAttrs($); +// return $('body').html(); +// }; + +// const extractAuthor = ($) => { +// const bylineClass = '.paraStyle_byline'; +// const bioClass = '.paraStyle_body_bio'; + +// const name = cleanTextValue($(bylineClass).text()).replace(/^by/i, '').trim(); + +// let image = null; +// let bio = ''; + +// $(bioClass).each(function () { +// const imgElement = $(this).children('img'); +// if (imgElement.length) { +// image = imgElement.attr('src'); +// } else { +// bio = `${bio}

${$(this).html()}

`; +// } +// }); + +// $(bylineClass).replaceWith(''); +// $(bioClass).replaceWith(''); +// return { +// name: name || null, +// image: image || null, +// bio: cleanBio(bio), +// }; +// }; module.exports = async (body) => { const html = stripWhitespace(body); const $ = loadHTML(html); - const deck = extractDeck($); - const author = extractAuthor($); + // const deck = extractDeck($); + // const author = extractAuthor($); adjustHeadings($); @@ -78,10 +78,7 @@ module.exports = async (body) => { removeAttrs($); return { - extracted: { - deck, - author, - }, + extracted: {}, html: { cleaned: $('body').html(), original: body, diff --git a/test/rules/pennwell/default.spec.js b/test/rules/pennwell/default.spec.js index 2d55a8d..8493b1d 100644 --- a/test/rules/pennwell/default.spec.js +++ b/test/rules/pennwell/default.spec.js @@ -1,3 +1,5 @@ +/* eslint-disable max-len */ + const rule = require('../../../src/rules/pennwell/default'); describe('rules/pennwell/default', () => { @@ -88,14 +90,11 @@ describe('rules/pennwell/default', () => {
`; const result = await rule(body); - expect(result.html.cleaned).to.equal('

Foo

Bar

Bar

Foo

Foo

Foo
Foo
Foo
'); + expect(result.html.cleaned).to.equal('

Foo

Bar

Bar

Foo
Foo
Foo
Foo
Foo
'); }); - it('should not adjust heading elements when an

<h1> is not present.', async () => { + it('should not adjust heading elements when an <h1> or <h2> is not present.', async () => { const body = `
-

Foo

-

Bar

-

Bar

Foo

Foo

@@ -106,7 +105,7 @@ describe('rules/pennwell/default', () => {
`; const result = await rule(body); - expect(result.html.cleaned).to.equal('

Foo

Bar

Bar

Foo

Foo

Foo

Foo
Foo
'); + expect(result.html.cleaned).to.equal('

Foo

Foo

Foo

Foo
Foo
'); }); it('should remove `class` attributes.', async () => { const body = ` @@ -144,80 +143,80 @@ describe('rules/pennwell/default', () => { const result = await rule(body); expect(result.html.cleaned).to.equal('
Bar
'); }); - it('should extract a deck value when present.', async () => { - const body = ` -
-

Put Drivers in - Safe Hands with Telematics

-

Foo

-
- `; - const result = await rule(body); - expect(result.extracted.deck).to.equal('Put Drivers in Safe Hands with Telematics'); - }); - it('should return a null deck when elements are present but are empty.', async () => { - const body = ` -
-

-

Foo

-
- `; - const result = await rule(body); - expect(result.extracted.deck).to.equal(null); - }); - it('should remove the deck elements when present.', async () => { - const body = ` -
-

Put Drivers in - Safe Hands with Telematics

-

Foo

-
- `; - const result = await rule(body); - expect(result.html.cleaned).to.equal('

Foo

'); - }); - it('should extract an author name when present.', async () => { - const body = ` -
- -
- `; - const result = await rule(body); - expect(result.extracted.author.name).to.equal('Jenny Shiner'); - }); - it('should extract an author image when present.', async () => { - const body = ` -
-

-
- `; - const result = await rule(body); - expect(result.extracted.author.image).to.equal('//aemstatic-ww2.azureedge.net/content/dam/up/print-articles/volume-23/issue-2/1902UPpf2-a01.jpg'); - }); - it('should extract an author bio when present.', async () => { - const body = ` -
-

The Author:

-

-

Jenny Shiner is the communications manager for GPS Insight. She graduated from Arizona State University with a bachelor’s degree in communication and is responsible for communication for all business segments that GPS Insight targets. For more information on telematics and fuel card technologies, visit www.gpsinsight.com.

-
- `; - const result = await rule(body); - expect(result.extracted.author.bio).to.equal('

The Author:

Jenny Shiner is the communications manager for GPS Insight. She graduated from Arizona State University with a bachelor’s degree in communication and is responsible for communication for all business segments that GPS Insight targets. For more information on telematics and fuel card technologies, visit www.gpsinsight.com.

'); - }); - it('should remove the author elements when present.', async () => { - const body = ` -
- -

Foo

-

The Author:

-

-

Jenny Shiner is the communications manager for GPS Insight. She graduated from Arizona State University with a bachelor’s degree in communication and is responsible for communication for all business segments that GPS Insight targets. For more information on telematics and fuel card technologies, visit www.gpsinsight.com.

-

Bar

-
- `; - const result = await rule(body); - expect(result.html.cleaned).to.equal('

Foo

Bar

'); - }); + // it('should extract a deck value when present.', async () => { + // const body = ` + //
+ //

Put Drivers in + // Safe Hands with Telematics

+ //

Foo

+ //
+ // `; + // const result = await rule(body); + // expect(result.extracted.deck).to.equal('Put Drivers in Safe Hands with Telematics'); + // }); + // it('should return a null deck when elements are present but are empty.', async () => { + // const body = ` + //
+ //

+ //

Foo

+ //
+ // `; + // const result = await rule(body); + // expect(result.extracted.deck).to.equal(null); + // }); + // it('should remove the deck elements when present.', async () => { + // const body = ` + //
+ //

Put Drivers in + // Safe Hands with Telematics

+ //

Foo

+ //
+ // `; + // const result = await rule(body); + // expect(result.html.cleaned).to.equal('

Foo

'); + // }); + // it('should extract an author name when present.', async () => { + // const body = ` + //
+ // + //
+ // `; + // const result = await rule(body); + // expect(result.extracted.author.name).to.equal('Jenny Shiner'); + // }); + // it('should extract an author image when present.', async () => { + // const body = ` + //
+ //

+ //
+ // `; + // const result = await rule(body); + // expect(result.extracted.author.image).to.equal('//aemstatic-ww2.azureedge.net/content/dam/up/print-articles/volume-23/issue-2/1902UPpf2-a01.jpg'); + // }); + // it('should extract an author bio when present.', async () => { + // const body = ` + //
+ //

The Author:

+ //

+ //

Jenny Shiner is the communications manager for GPS Insight. She graduated from Arizona State University with a bachelor’s degree in communication and is responsible for communication for all business segments that GPS Insight targets. For more information on telematics and fuel card technologies, visit www.gpsinsight.com.

+ //
+ // `; + // const result = await rule(body); + // expect(result.extracted.author.bio).to.equal('

The Author:

Jenny Shiner is the communications manager for GPS Insight. She graduated from Arizona State University with a bachelor’s degree in communication and is responsible for communication for all business segments that GPS Insight targets. For more information on telematics and fuel card technologies, visit www.gpsinsight.com.

'); + // }); + // it('should remove the author elements when present.', async () => { + // const body = ` + //
+ // + //

Foo

+ //

The Author:

+ //

+ //

Jenny Shiner is the communications manager for GPS Insight. She graduated from Arizona State University with a bachelor’s degree in communication and is responsible for communication for all business segments that GPS Insight targets. For more information on telematics and fuel card technologies, visit www.gpsinsight.com.

+ //

Bar

+ //
+ // `; + // const result = await rule(body); + // expect(result.html.cleaned).to.equal('

Foo

Bar

'); + // }); }); From 017c3c4e04ac31a6ee9ae18ec59a792387055509 Mon Sep 17 00:00:00 2001 From: Jacob Bare Date: Wed, 10 Apr 2019 12:16:30 -0500 Subject: [PATCH 3/3] Update rule readme --- src/rules/pennwell/README.md | 43 +++--------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/src/rules/pennwell/README.md b/src/rules/pennwell/README.md index d2668b2..ed752ac 100644 --- a/src/rules/pennwell/README.md +++ b/src/rules/pennwell/README.md @@ -2,45 +2,8 @@ ## Default To access, send a `POST` request to `/pennwell/default`. This rule set performs the following operations: -- Removes duplicative whitespace values (via `html.replace(/\s\s+/g, '')`) -- Extracts the `deck` text from elements classed with `.paraStyle_headline_deck` and removes the element from the cleaned HTML. -- Extracts an `author` object from elements classed with `.paraStyle_byline` or `.paraStyle_body_bio` and removes the elements from the cleaned HTML. -- If an `

<h1>` is detected anywhere in the body, all heading elements are increased by one (e.g. `<h1>` becomes `<h2>`, `<h2>` becomes `<h3>`, etc). +- Removes duplicative whitespace values +- If an `<h1>` or `<h2>` is detected anywhere in the body, all heading elements are increased by two (e.g. `<h1>` becomes `<h3>`, `<h2>` becomes `<h4>`, etc). - Removes all `
` and `