Web POC

node v16.15.0
version: 5.0.0
endpoint · share · tweet
// Pages to scrape; each entry is fetched and converted into one page record.
const urls = [
  "https://philadelphiabar.org/page/EthicsOpinions",
  "https://philadelphiabar.org/page/BoardAgendas",
  "https://philadelphiabar.org/page/SectionChairs",
  "https://philadelphiabar.org/page/HelpfulWebSites",
  "https://philadelphiabar.org/page/BoardResolutions2013",
  "https://www.philadelphiabar.org/page/ProbateAndTrustLawSection",
];
const axios = require("axios");
const cheerio = require("cheerio");
// Bound to its own name so the native Promise global is not shadowed.
const Bluebird = require("bluebird");
// WHATWG URL class (same object as the global), replacing legacy url.parse().
const { URL } = require("url");

// Group UID listed in the "view" permission of every generated content item.
const PUBLIC_GROUP_UID = "b3d7ce68-11dc-4e12-a43f-e6f23892b482";

/**
 * Splits a CamelCase identifier into space-separated words.
 * Each run of one capital followed by lowercase letters/digits becomes a word;
 * any text between such runs is kept as its own word.
 *
 * @param {string} s - Identifier such as "EthicsOpinions".
 * @returns {string} The words joined by single spaces, e.g. "Ethics Opinions".
 */
function SplitCamelCaseWithAbbreviations(s) {
  // The capture group makes split() keep the matched words; empty pieces
  // between adjacent matches are dropped.
  return s
    .split(/([A-Z][a-z0-9]+)/)
    .filter(Boolean)
    .join(" ");
}

/**
 * Upper-cases the first character of a string.
 *
 * @param {string} s - Input string (may be empty).
 * @returns {string} The string with its first character upper-cased.
 */
function upperCaseFirstLetter(s) {
  return s.charAt(0).toUpperCase() + s.slice(1);
}

/**
 * Escapes the characters that are significant in HTML text and attributes.
 *
 * @param {string} text - Plain text.
 * @returns {string} Text that is safe to embed inside HTML markup.
 */
function escapeHtml(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}

/**
 * Builds one page-zone entry holding a single English HTML content item that
 * is viewable by the public group.
 *
 * @param {string} zoneName - Zone identifier ("main", "M", "N").
 * @param {string} contentTitle - Title of the content item.
 * @param {string} rawContent - HTML body of the content item.
 * @returns {object} Zone object ready to be pushed onto PageZones.
 */
function buildZone(zoneName, contentTitle, rawContent) {
  return {
    zoneName,
    content: [
      {
        isHTML: 1,
        uid: "",
        permissions: [
          { functionname: "view", allowed: "1", groupUIDs: [PUBLIC_GROUP_UID] },
        ],
        languages: [
          { languageCode: "en", contentTitle, description: "", rawContent },
        ],
      },
    ],
  };
}

/**
 * Downloads a page and converts it into a page record: metadata derived from
 * the URL and navigation, plus up to three content zones (main body, title
 * header, sidebar).
 *
 * @param {string} url - Absolute URL of the page to scrape.
 * @returns {Promise<object>} The page record.
 * @throws Rejects if the HTTP request fails or `url` is not a valid absolute URL.
 */
async function parsePage(url) {
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);
  const { href, pathname, search } = new URL(url);

  const pageTitle = ($("div.main-content .defaultText h1").first().text() || "").trim();

  // Use the pathname only (no query string) and skip empty segments so a
  // trailing slash does not produce an empty page name.
  const lastPathSegment = pathname.split("/").filter(Boolean).pop() || "";
  const pageTitleFromURL = SplitCamelCaseWithAbbreviations(
    upperCaseFirstLetter(lastPathSegment)
  );
  const pageTitleToUse = pageTitle.length ? pageTitle : pageTitleFromURL;

  // Remove the page title and sub-navigation from the content before it is
  // captured; the title is emitted separately in its own zone.
  $("div.main-content .defaultText h1").first().remove();
  $(".page-subnav").remove();

  const mainContent = ($("div.main-content .defaultText").html() || "").trim();
  const sidebarContent = ($("div.second-sidebar").html() || "").trim();
  const subsection = ($("div.first-sidebar .subnav a.selected").html() || "").trim();
  const mainsection = ($("div.mainnav #navbar a.selected").html() || "").trim();

  const results = {
    url: href,
    pagename: lastPathSegment,
    section: [mainsection, subsection].filter(Boolean).join("/"),
    SiteResourceType: "UserCreatedPage",
    Status: "Active",
    // Path plus query string, matching what the legacy `path` field held.
    QuickLink: `${pathname}${search}`,
    InheritPlacements: 1,
    ModeOverride: "",
    TemplateOverride: "",
    ApplyNoIndex: "no",
    ApplyNoFollow: "no",
    ApplyNoArchive: "no",
    languages: [
      {
        languageCode: "en",
        PageTitle: pageTitleToUse,
        Keywords: "",
        Description: "",
      },
    ],
    PageZones: [],
  };

  results.PageZones.push(buildZone("main", pageTitleToUse, mainContent));

  if (pageTitleToUse.length) {
    // The title is plain text (from .text() or the URL), so it must be escaped
    // before being embedded in markup.
    results.PageZones.push(
      buildZone(
        "M",
        `${pageTitleToUse} Header`,
        `<ul><li><h2>${escapeHtml(pageTitleToUse)}</h2></li></ul>`
      )
    );
  }

  if (sidebarContent.length) {
    results.PageZones.push(
      buildZone("N", `${pageTitleToUse} Sidebar`, sidebarContent)
    );
  }

  return results;
}

/**
 * Scrapes every URL in `urls` and prints one JSON document per page, in the
 * same order as the input list.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const results = await Bluebird.map(urls, (pageUrl) => parsePage(pageUrl));
  results.forEach((page) => console.log(JSON.stringify(page)));
}

// Wrapped in main() because top-level await is not available in a CommonJS
// (require-based) script under plain Node.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Loading…

no comments

    sign in to comment