2021-02-06 14:44:46 +02:00
|
|
|
import * as parse5 from 'parse5';
|
|
|
|
import treeAdapter = require('parse5/lib/tree-adapters/default');
|
|
|
|
import { URL } from 'url';
|
2021-04-02 04:36:11 +03:00
|
|
|
|
|
|
|
// Prefix match: the string starts with `http://` or `https://` followed by at
// least one character from the allowed set. Not anchored at the end, so
// trailing characters outside the set do not prevent a match.
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
|
|
|
|
// Whole-string match: same scheme and character set as `urlRegex`, but anchored
// at both ends, so every character of the string must belong to the allowed set.
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
|
2018-06-20 19:21:57 +03:00
|
|
|
|
2022-02-04 04:10:53 +02:00
|
|
|
/**
 * Converts an HTML fragment into text with lightweight inline markup.
 *
 * The fragment is parsed with parse5 and walked depth-first. Text nodes are
 * copied verbatim; selected elements are rewritten (`<br>` to a newline,
 * `<b>`/`<strong>` to `**…**`, `<s>`/`<del>` to `~~…~~`, `<code>` to
 * backticks, `<pre><code>` to a fenced block, `<blockquote>` to `> ` lines,
 * `<a>` to a hashtag, mention or link form); unknown elements contribute only
 * their children. Comment and doctype nodes are skipped.
 *
 * @param html - HTML fragment to convert.
 * @param hashtagNames - Optional list of hashtag texts. An `<a>` that has an
 *   `href` and whose text equals one of these (case-insensitively) is emitted
 *   as its plain text instead of as a link.
 * @returns The converted text with leading and trailing whitespace trimmed.
 */
export function fromHtml(html: string, hashtagNames?: string[]): string {
	const dom = parse5.parseFragment(html);

	// Lower-cased once here instead of once per <a> element.
	const lowerHashtagNames = hashtagNames ? hashtagNames.map(x => x.toLowerCase()) : null;

	// Output accumulator shared by the nested helpers below.
	let text = '';

	for (const n of dom.childNodes) {
		analyze(n);
	}

	return text.trim();

	/**
	 * Returns the concatenated text content of `node` without any markup
	 * conversion, except that `<br>` becomes a newline.
	 */
	function getText(node: parse5.Node): string {
		if (treeAdapter.isTextNode(node)) return node.value;
		if (!treeAdapter.isElementNode(node)) return '';
		if (node.nodeName === 'br') return '\n';

		if (node.childNodes) {
			return node.childNodes.map(n => getText(n)).join('');
		}

		return '';
	}

	/** Runs `analyze` on each child in order; tolerates a missing child list. */
	function appendChildren(childNodes: parse5.ChildNode[]): void {
		if (childNodes) {
			for (const n of childNodes) {
				analyze(n);
			}
		}
	}

	/** Appends the converted representation of `node` to `text`. */
	function analyze(node: parse5.Node) {
		if (treeAdapter.isTextNode(node)) {
			text += node.value;
			return;
		}

		// Skip comment or document type node
		if (!treeAdapter.isElementNode(node)) return;

		switch (node.nodeName) {
			case 'br': {
				text += '\n';
				break;
			}

			case 'a': {
				const txt = getText(node);
				const rel = node.attrs.find(x => x.name === 'rel');
				const href = node.attrs.find(x => x.name === 'href');

				// Hashtag
				if (lowerHashtagNames && href && lowerHashtagNames.includes(txt.toLowerCase())) {
					text += txt;
				// Mention
				} else if (txt.startsWith('@') && !(rel && rel.value.match(/^me /))) {
					const part = txt.split('@');

					if (part.length === 2 && href) {
						//#region The host part is omitted from the text, so restore it from the href
						let hostname = '';
						try {
							hostname = new URL(href.value).hostname;
						} catch {
							// `new URL` throws on relative or malformed hrefs; one bad link
							// must not abort the whole conversion.
						}
						// Without a usable hostname, keep the visible text rather than
						// emitting a dangling "@user@".
						text += hostname ? `${txt}@${hostname}` : txt;
						//#endregion
					} else if (part.length === 3) {
						// Already of the form "@user@host".
						text += txt;
					}
					// NOTE(review): a mention without href, or with more than two '@'
					// separators, emits nothing (unchanged from the previous behavior) —
					// confirm this is intended.
				// Otherwise
				} else {
					const generateLink = () => {
						if (!href && !txt) {
							return '';
						}
						if (!href) {
							return txt;
						}
						if (!txt || txt === href.value) { // #6383: Missing text node
							if (href.value.match(urlRegexFull)) {
								return href.value;
							} else {
								return `<${href.value}>`;
							}
						}
						// A URL that only partially matches the allowed character set is
						// wrapped in angle brackets inside the link target.
						if (href.value.match(urlRegex) && !href.value.match(urlRegexFull)) {
							return `[${txt}](<${href.value}>)`; // #6846
						} else {
							return `[${txt}](${href.value})`;
						}
					};

					text += generateLink();
				}
				break;
			}

			case 'h1': {
				text += '【';
				appendChildren(node.childNodes);
				text += '】\n';
				break;
			}

			case 'b':
			case 'strong': {
				text += '**';
				appendChildren(node.childNodes);
				text += '**';
				break;
			}

			case 'small': {
				text += '<small>';
				appendChildren(node.childNodes);
				text += '</small>';
				break;
			}

			case 's':
			case 'del': {
				text += '~~';
				appendChildren(node.childNodes);
				text += '~~';
				break;
			}

			case 'i':
			case 'em': {
				text += '<i>';
				appendChildren(node.childNodes);
				text += '</i>';
				break;
			}

			// block code (<pre><code>)
			case 'pre': {
				if (node.childNodes.length === 1 && node.childNodes[0].nodeName === 'code') {
					text += '\n```\n';
					// Raw text only: markup inside a code block is not converted.
					text += getText(node.childNodes[0]);
					text += '\n```\n';
				} else {
					appendChildren(node.childNodes);
				}
				break;
			}

			// inline code (<code>)
			case 'code': {
				text += '`';
				appendChildren(node.childNodes);
				text += '`';
				break;
			}

			case 'blockquote': {
				const t = getText(node);
				if (t) {
					// Prefix every line of the quoted text with "> ".
					text += '\n> ';
					text += t.split('\n').join(`\n> `);
				}
				break;
			}

			case 'p':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6': {
				text += '\n\n';
				appendChildren(node.childNodes);
				break;
			}

			// other block elements
			case 'div':
			case 'header':
			case 'footer':
			case 'article':
			case 'li':
			case 'dt':
			case 'dd': {
				text += '\n';
				appendChildren(node.childNodes);
				break;
			}

			default: { // includes inline elements
				appendChildren(node.childNodes);
				break;
			}
		}
	}
}
|