From 7d680c47cd26db139914e61ebbb2b3ce05eea665 Mon Sep 17 00:00:00 2001 From: dragongoose <19649813+dragongoose@users.noreply.github.com> Date: Tue, 14 Mar 2023 17:17:17 -0400 Subject: [PATCH] Switch to much more lightweight and faster scraping method --- server/package.json | 4 - server/routes/profileRoute.ts | 6 +- server/types/scraping/Streamer.ts | 24 +-- server/util/scraping/extractor/index.ts | 189 ++++++++++++++++++++++ server/util/scraping/extractors.ts | 203 ------------------------ 5 files changed, 206 insertions(+), 220 deletions(-) create mode 100644 server/util/scraping/extractor/index.ts delete mode 100644 server/util/scraping/extractors.ts diff --git a/server/package.json b/server/package.json index bdf1065..a1e6fc1 100644 --- a/server/package.json +++ b/server/package.json @@ -1,12 +1,8 @@ { "dependencies": { - "@dragongoose/streamlink": "^1.1.1", "connect-history-api-fallback": "^2.0.0", "dotenv": "^16.0.3", "express": "^4.18.2", - "puppeteer": "^19.7.2", - "puppeteer-extra": "^3.3.6", - "puppeteer-extra-plugin-adblocker": "^2.13.6", "winston": "^3.8.2", "ws": "^8.13.0" }, diff --git a/server/routes/profileRoute.ts b/server/routes/profileRoute.ts index 999a107..cfee2ee 100644 --- a/server/routes/profileRoute.ts +++ b/server/routes/profileRoute.ts @@ -1,13 +1,13 @@ import { Router } from 'express' -import { TwitchScraper } from '../util/scraping/extractors' +import { TwitchAPI } from '../util/scraping/extractor/index' const profileRouter = Router() -const scraper = new TwitchScraper() +const twitch = new TwitchAPI() profileRouter.get('/users/:username', async (req, res, next) => { const username = req.params.username - let streamerData = await scraper.getStreamerData(username) + let streamerData = await twitch.getStreamerInfo(username) .catch(next) if (streamerData) diff --git a/server/types/scraping/Streamer.ts b/server/types/scraping/Streamer.ts index 35148ed..affa30a 100644 --- a/server/types/scraping/Streamer.ts +++ b/server/types/scraping/Streamer.ts @@ -1,4 +1,4 @@ -export interface Socials { +export interface Social { type: string | null text: string, link: string @@ -9,16 +9,20 @@ export interface StreamData { title: string topic: string startedAt: number - qualities: string[] + viewers: number + preview: string } export interface StreamerData { - username: string, - followers: number, - followersAbbv: string, - isLive: boolean, - about: string, - socials?: string[], - pfp: string; - stream?: StreamData + username: string + followers: number + followersAbbv: string + isLive: boolean + about: string + socials?: Social[] + pfp: string + stream: StreamData | null + isPartner: boolean + colorHex: string + id: number } \ No newline at end of file diff --git a/server/util/scraping/extractor/index.ts b/server/util/scraping/extractor/index.ts new file mode 100644 index 0000000..7a51390 --- /dev/null +++ b/server/util/scraping/extractor/index.ts @@ -0,0 +1,189 @@ +import { LooseObject } from "../../../types/looseTypes" +import { StreamerData, StreamData, Social } from "../../../types/scraping/Streamer" + +/** + * Class that interacts with the Twitch api + */ +export class TwitchAPI { + public readonly twitchUrl = 'https://gql.twitch.tv/gql' + public headers = { + "Client-Id": "kimne78kx3ncx6brgo4mv6wki5h1ko" + } + + constructor() {} + + /** + * Gets information about a streamer, like socials, about, and more. + * @see StreamerData + * @param streamerName The username of the streamer + * @returns Promise + */ + public getStreamerInfo = async (streamerName: string) => { + const payload = [ + { + "operationName": "ChannelRoot_AboutPanel", + "variables": { + "channelLogin": streamerName, + "skipSchedule": false + }, + "extensions": { + "persistedQuery": { + "version": 1, + "sha256Hash": "6089531acef6c09ece01b440c41978f4c8dc60cb4fa0124c9a9d3f896709b6c6" + } + } + }, + { + "operationName":"StreamMetadata", + "variables":{ + "channelLogin": streamerName + }, + "extensions":{ + "persistedQuery":{ + "version":1, + "sha256Hash":"a647c2a13599e5991e175155f798ca7f1ecddde73f7f341f39009c14dbf59962" + } + } + }, + { + "operationName": "StreamTagsTrackingChannel", + "variables": { + "channel": streamerName + }, + "extensions": { + "persistedQuery": { + "version": 1, + "sha256Hash": "6aa3851aaaf88c320d514eb173563d430b28ed70fdaaf7eeef6ed4b812f48608" + } + } + }, + { + "operationName": "VideoPreviewOverlay", + "variables": { + "login": streamerName + }, + "extensions": { + "persistedQuery": { + "version": 1, + "sha256Hash": "9515480dee68a77e667cb19de634739d33f243572b007e98e67184b1a5d8369f" + } + } + }, + { + "operationName": "UseViewCount", + "variables": { + "channelLogin": streamerName + }, + "extensions": { + "persistedQuery": { + "version": 1, + "sha256Hash": "00b11c9c428f79ae228f30080a06ffd8226a1f068d6f52fbc057cbde66e994c2" + } + } + }, + ] + + const res = await fetch(this.twitchUrl, { + method: 'POST', + body: JSON.stringify(payload), + headers: this.headers + }) + + const data = await res.json() + console.log(data[1].data, data[1].data.user.stream) + + + const rawStreamerData = data[0].data + + + // get socials + const socials: LooseObject[] = [] + if (rawStreamerData.user.channel && rawStreamerData.user.channel.socialMedias) { + for (let social of rawStreamerData.user.channel.socialMedias) { + socials.push({ + type: social.name, + name: social.title, + link: social.url + }) + } + } + + // check if is liver + const rawStreamData = data[1].data.user.stream + let parsedStream: StreamData | null; + if(!rawStreamData) { + parsedStream = null + } else { + const tags: string[] = [] + for (let tagData of data[2].data.user.stream.freeformTags) { + tags.push(tagData.name) + } + + parsedStream = { + title: data[1].data.user.lastBroadcast.title, + topic: rawStreamData.game.name, + startedAt: new Date(rawStreamData.createdAt).valueOf(), + tags, + viewers: Number(data[4].data.user.stream.viewersCount), + preview: data[3].data.user.stream.previewImageURL + } + } + + const abbreviatedFollowers = Intl.NumberFormat('en-US', { + notation: "compact", + maximumFractionDigits: 1 + }).format(rawStreamerData.user.followers.totalCount) + + const streamerData: StreamerData = { + username: rawStreamerData.user.displayName, + about: rawStreamerData.user.description, + pfp: rawStreamerData.user.profileImageURL, + followers: rawStreamerData.user.followers.totalCount, + socials: socials as Social[], + isLive: (!!parsedStream), + isPartner: rawStreamerData.user.isPartner, + followersAbbv: abbreviatedFollowers, + colorHex: '#' + rawStreamerData.user.primaryColorHex, + id: Number(rawStreamerData.user.id), + stream: parsedStream + } + + return Promise.resolve(streamerData) + } + + /** + * Gets the current viewers of a stream + * @param streamerName The username of the streamer + * @returns Promise + */ + public getViewers = async (streamerName: string) => { + const payload = [ + { + "operationName": "UseViewCount", + "variables": { + "channelLogin": streamerName + }, + "extensions": { + "persistedQuery": { + "version": 1, + "sha256Hash": "00b11c9c428f79ae228f30080a06ffd8226a1f068d6f52fbc057cbde66e994c2" + } + } + }, + ] + + const res = await fetch(this.twitchUrl, { + method: 'POST', + body: JSON.stringify(payload), + headers: this.headers + }) + + const rawData = await res.json() + console.log(rawData) + + if(!rawData[0].data.user.stream) + return Promise.reject(new Error(`Streamer ${streamerName} is not live`)) + + return Promise.resolve(rawData[0].data.user.stream.viewersCount) + } +} \ No newline at end of file diff --git a/server/util/scraping/extractors.ts b/server/util/scraping/extractors.ts deleted file mode 100644 index db49052..0000000 --- a/server/util/scraping/extractors.ts +++ /dev/null @@ -1,203 +0,0 @@ -import puppeteer from 'puppeteer-extra' -import { Browser, Page } from 'puppeteer' -import { PuppeteerExtraPluginAdblocker } from 'puppeteer-extra-plugin-adblocker' -import { LooseObject } from '../../types/looseTypes' -import { StreamData, StreamerData, Socials } from '../../types/scraping/Streamer' -import { Streamlink } from '@dragongoose/streamlink' - - -export class TwitchScraper { - public cache: Map = new Map() - - - constructor() { - puppeteer.use(new PuppeteerExtraPluginAdblocker({ - blockTrackersAndAnnoyances: true - })) - } - - private abbreviatedNumberToNumber = (num: string) => { - const base = parseFloat(num) - - const matches: {[k: string]: number} = { - 'k': 1000, - 'm': 1000000, - 'b': 1000000000 - } - - const abbreviation: string = num.charAt(num.length - 1).toLowerCase() - - - if(matches[abbreviation]) { - const numberOnly: number = Number(num.slice(0, -1)) - return numberOnly * matches[abbreviation] - } else { - return null - } - } - - // https:// advancedweb.hu/how-to-speed-up-puppeteer-scraping-with-parallelization/ - private withBrowser = async (fn: Function) => { - const browser = await puppeteer.launch({ - headless: true, - args: ['--no-sandbox'] - }); - try { - return await fn(browser); - } finally { - await browser.close(); - } - } - - private withPage = (browser: Browser) => async (fn: Function) => { - const page = await browser.newPage(); - //await page.tracing.start({ path: '../profile.json', screenshots: true }); - try { - return await fn(page); - } finally { - //await page.tracing.stop(); - await page.close(); - } - } - - private getStreamData = async (page: Page, isLive: boolean) => { - const streamData: LooseObject = {} - - if(!isLive) return null - - // Get stream tags - const tagsSelector = '.eUxEWt * span' - const tags: string[] = await page.$$eval(tagsSelector, elements => elements.map(el => el.innerHTML)) - streamData.tags = tags - - // Get stream title - const titleSelector = 'h2.CoreText-sc-1txzju1-0' - const title: string = await page.$eval(titleSelector, element => element.innerText) - streamData.title = title - - // Get topic - const topicSelector = '.hfMGmo' - const topic = await page.$eval(topicSelector, element => element.textContent) - streamData.topic = topic - - // Get Start time - const liveTimeSelector = '.live-time' - - // formated as HH:MM:SS - const liveTime = await page.$eval(liveTimeSelector, element => element.textContent) - if(!liveTime) return - const liveTimeSplit: number[] = liveTime.split(':').map(Number) - let date = new Date() - let { hours, minutes, seconds } = { hours: date.getHours(), minutes: date.getMinutes(), seconds: date.getSeconds()} - - // Subtracts current live time from current - // date to get the time the stream started - date.setHours(hours - liveTimeSplit[0]) - date.setMinutes(minutes - liveTimeSplit[1]) - date.setSeconds(seconds - liveTimeSplit[2]) - - streamData.startedAt = date.getTime() - - return streamData as StreamData - } - - private getAboutData = async (page: Page, isLive: boolean) => { - const aboutData: LooseObject = {} - - if (!isLive) { - // Get data from about page - const aboutPageButtonSelector = 'li.InjectLayout-sc-1i43xsx-0:nth-child(2) > a:nth-child(1) > div:nth-child(1) > div:nth-child(1) > p:nth-child(1)' - await page.click(aboutPageButtonSelector) - } - await page.waitForSelector('.kuAEke') - - const followersSelector = '.kuAEke' - const followers = await page.$eval(followersSelector, element => element.innerHTML) - aboutData.followersAbbv = followers - aboutData.followers = this.abbreviatedNumberToNumber(followers) - - const aboutSectionSelector = '.kLFSJC' - const aboutSection = await page.$eval(aboutSectionSelector, element => element.innerHTML) - aboutData.about = aboutSection - - const socialSelector = '.ccXeNc * a' - const socials: Socials[] = await page.$$eval(socialSelector, elements => elements.map((el) => { - - const getHostName = (url: string) => { - const match = url.match(/:\/\/(www[0-9]?\.)?(.[^/:]+)/i); - if (match != null && match.length > 2 && typeof match[2] === 'string' && match[2].length > 0) { - const hostname = match[2].split("."); - return hostname[0]; - } - else { - return null; - } - } - - const validHosts = ['instagram', 'youtube', 'discord', 'tiktok','twitter'] - const socialHost = getHostName(el.href) || el.href || '' - let type: string | null = socialHost - if(!validHosts.includes(socialHost)) - type = null - - return { - type, - link: el.href, - text: el.innerText - } - })) - aboutData.socials = socials - - const profilePictureSelector = 'figure.ScAvatar-sc-144b42z-0:nth-child(2) > img:nth-child(1)' - const profilePicutre = await page.$eval(profilePictureSelector, element => element.getAttribute('src')) - aboutData.pfp = profilePicutre - - return aboutData as StreamerData - } - - public getStreamerData = async (username: string): Promise => { - let recoveredData: LooseObject = {} - let isLive = await this.isLive(username) - - await this.withBrowser(async (browser: Browser) => { - const result = await this.withPage(browser)(async (page: Page) => { - const res = await page.goto(`https://twitch.tv/${username}`) - - if(!res?.ok()) { - return null - } else { - return Promise.all([this.getStreamData(page, isLive), this.getAboutData(page, isLive)]) - } - }) - - recoveredData = result[1] - recoveredData.stream = result[0] - if(result[0] !== null) recoveredData.isLive = true - - await browser.close() - }) - - - // add final information - if(recoveredData && recoveredData.stream && isLive) - recoveredData.stream.qualities = await this.getQualities(username) - - if(recoveredData) { - recoveredData.isLive = isLive - } - - recoveredData.username = username - return recoveredData as StreamerData - } - - public isLive = async (username: string) => { - const streamlink = new Streamlink(`https://twitch.tv/${username}`, {}) - return await streamlink.isLive() - } - - public getQualities = async (username: string) => { - const streamlink = new Streamlink(`https://twitch.tv/${username}`, {}) - return await streamlink.getQualities() - } - -}