Commit b24c804d authored by nanahira's avatar nanahira

ocr

parent 51c6ce2c
This diff is collapsed.
...@@ -31,7 +31,8 @@ ...@@ -31,7 +31,8 @@
"koishi-thirdeye": "^3.0.5", "koishi-thirdeye": "^3.0.5",
"lodash": "^4.17.21", "lodash": "^4.17.21",
"moment": "^2.29.1", "moment": "^2.29.1",
"source-map-support": "^0.5.21" "source-map-support": "^0.5.21",
"tesseract.js": "^2.1.5"
}, },
"devDependencies": { "devDependencies": {
"@types/ip": "^1.1.0", "@types/ip": "^1.1.0",
......
...@@ -57,11 +57,15 @@ const chineseCharacterWordList = [ ...@@ -57,11 +57,15 @@ const chineseCharacterWordList = [
{ character: '', value: 0 }, { character: '', value: 0 },
{ character: '', value: 0 }, { character: '', value: 0 },
{ character: '', value: 0 }, { character: '', value: 0 },
{ character: 'O', value: 0 },
{ character: 'o', value: 0 },
{ character: '', value: 1 }, { character: '', value: 1 },
{ character: '', value: 1 }, { character: '', value: 1 },
{ character: '', value: 1 }, { character: '', value: 1 },
{ character: '', value: 1 }, { character: '', value: 1 },
{ character: '', value: 1 }, { character: '', value: 1 },
{ character: 'i', value: 1 },
{ character: 'I', value: 1 },
{ character: '', value: 2 }, { character: '', value: 2 },
{ character: '', value: 2 }, { character: '', value: 2 },
{ character: '', value: 2 }, { character: '', value: 2 },
...@@ -74,6 +78,7 @@ const chineseCharacterWordList = [ ...@@ -74,6 +78,7 @@ const chineseCharacterWordList = [
{ character: '', value: 5 }, { character: '', value: 5 },
{ character: '', value: 5 }, { character: '', value: 5 },
{ character: '', value: 5 }, { character: '', value: 5 },
{ character: 'S', value: 5 },
{ character: '', value: 6 }, { character: '', value: 6 },
{ character: '', value: 6 }, { character: '', value: 6 },
{ character: '', value: 6 }, { character: '', value: 6 },
...@@ -90,6 +95,8 @@ const chineseCharacterWordList = [ ...@@ -90,6 +95,8 @@ const chineseCharacterWordList = [
{ character: '', value: 8 }, { character: '', value: 8 },
{ character: '', value: 8 }, { character: '', value: 8 },
{ character: '', value: 8 }, { character: '', value: 8 },
{ character: 'B', value: 8 },
{ character: 'b', value: 8 },
{ character: '', value: 9 }, { character: '', value: 9 },
{ character: '', value: 9 }, { character: '', value: 9 },
{ character: '', value: 9 }, { character: '', value: 9 },
...@@ -101,6 +108,9 @@ const chineseCharacterWordList = [ ...@@ -101,6 +108,9 @@ const chineseCharacterWordList = [
{ character: '', value: '' }, { character: '', value: '' },
{ character: '', value: '' }, { character: '', value: '' },
{ character: '', value: '' }, { character: '', value: '' },
{ character: '', value: ':' },
{ character: ';', value: ':' },
{ character: '', value: ':' },
].map((l) => ({ character: l.character, value: l.value.toString() })); ].map((l) => ({ character: l.character, value: l.value.toString() }));
const chineseCharacterWordGroup: Record< const chineseCharacterWordGroup: Record<
string, string,
......
import 'source-map-support/register'; import 'source-map-support/register';
import { DefineSchema, RegisterSchema } from 'koishi-thirdeye'; import { DefineSchema, RegisterSchema } from 'koishi-thirdeye';
import ip from 'ip'; import ip from 'ip';
import Tesseract, { createWorker } from 'tesseract.js';
import { Logger } from 'koishi';
import path from 'path';
@RegisterSchema() @RegisterSchema()
export class HisoutensokuJammerPluginConfig { export class HisoutensokuJammerPluginConfig {
...@@ -10,6 +13,47 @@ export class HisoutensokuJammerPluginConfig { ...@@ -10,6 +13,47 @@ export class HisoutensokuJammerPluginConfig {
@DefineSchema({ desc: 'IP 白名单', default: [], type: 'string' }) @DefineSchema({ desc: 'IP 白名单', default: [], type: 'string' })
addressWhitelist: string[]; addressWhitelist: string[];
@DefineSchema({ desc: '开启文字识别', default: false })
ocr: boolean;
@DefineSchema({ desc: '模型语言', default: 'eng' })
ocrLanguage: string;
@DefineSchema({ type: 'object', allowUnknown: true })
ocrExtraOptions: Partial<Tesseract.WorkerOptions>;
@DefineSchema({ type: 'object', allowUnknown: true })
ocrExtraParameters: Partial<Tesseract.WorkerParams>;
/**
 * Creates and initializes the tesseract.js OCR worker described by this config.
 *
 * The worker is loaded with `ocrLanguage`, and its parameters are set to a
 * digit/punctuation whitelist that `ocrExtraParameters` may override.
 *
 * @returns The ready worker, or `undefined` when `ocr` is disabled or when
 *   any initialization step fails (the failure is logged, not thrown).
 */
async loadOcr() {
  if (!this.ocr) return;
  const logger = new Logger('jammer-ocr');
  if (this.ocrExtraOptions?.dataPath) {
    logger.info(
      `Will load tesseract.js data files from ${this.ocrExtraOptions.dataPath}`,
    );
  }
  // Declared outside the try block so the catch handler can release it.
  let worker: Tesseract.Worker | undefined;
  try {
    worker = createWorker({
      logger: (m) => logger.debug(m),
      //dataPath: path.join(__dirname, '..', 'lang-data'),
      ...(this.ocrExtraOptions || {}),
    });
    await worker.load();
    await worker.loadLanguage(this.ocrLanguage);
    await worker.initialize(this.ocrLanguage);
    await worker.setParameters({
      tessedit_char_whitelist: '0123456789.: _-+',
      ...(this.ocrExtraParameters || {}),
    });
    logger.info(`OCR worker loaded.`);
    return worker;
  } catch (e) {
    // String(e) matches e.toString() for Error objects and is also safe for
    // non-Error throwables such as null or undefined.
    logger.error(`Failed to load OCR worker: ${String(e)}`);
    if (worker) {
      // A half-initialized worker would otherwise stay alive; release it.
      try {
        await worker.terminate();
      } catch (terminateError) {
        logger.warn(
          `Failed to terminate OCR worker: ${String(terminateError)}`,
        );
      }
    }
    return;
  }
}
getSubnet(addr: string) { getSubnet(addr: string) {
return ip.cidrSubnet(addr.includes('/') ? addr : `${addr}/32`); return ip.cidrSubnet(addr.includes('/') ? addr : `${addr}/32`);
} }
......
...@@ -15,6 +15,7 @@ import { ...@@ -15,6 +15,7 @@ import {
import { Attacker } from './attacker'; import { Attacker } from './attacker';
import moment from 'moment'; import moment from 'moment';
import { chineseCharacterList } from './chinese-replace'; import { chineseCharacterList } from './chinese-replace';
import { createWorker, Worker } from 'tesseract.js';
export * from './config'; export * from './config';
declare module 'koishi' { declare module 'koishi' {
...@@ -31,6 +32,8 @@ declare module 'koishi' { ...@@ -31,6 +32,8 @@ declare module 'koishi' {
const matcherGlobal = /([1-2]? *\d? *\d *)(([^\d][1-2]?\d{1,2}){3}).+?([1-6] *\d *\d *\d *\d)/g; const matcherGlobal = /([1-2]? *\d? *\d *)(([^\d][1-2]?\d{1,2}){3}).+?([1-6] *\d *\d *\d *\d)/g;
const matcherSingle = /([1-2]? *\d? *\d *)(([^\d][1-2]?\d{1,2}){3}).+?([1-6] *\d *\d *\d *\d)/; const matcherSingle = /([1-2]? *\d? *\d *)(([^\d][1-2]?\d{1,2}){3}).+?([1-6] *\d *\d *\d *\d)/;
const PROTOCOL_BASE64 = 'base64://';
@KoishiPlugin({ @KoishiPlugin({
name: 'hisoutensoku-jammer', name: 'hisoutensoku-jammer',
schema: HisoutensokuJammerPluginConfig, schema: HisoutensokuJammerPluginConfig,
...@@ -55,28 +58,78 @@ export default class HisoutensokuJammerPlugin { ...@@ -55,28 +58,78 @@ export default class HisoutensokuJammerPlugin {
@InjectConfig() @InjectConfig()
private config: HisoutensokuJammerPluginConfig; private config: HisoutensokuJammerPluginConfig;
ocrWorker: Worker;
/**
 * Handler registered for the 'connect' event: asks the plugin config to
 * build the OCR worker and stores the result on this instance.
 * The stored value is whatever `loadOcr()` resolves to.
 */
@UseEvent('connect')
async loadWorkers() {
  const loadedWorker = await this.config.loadOcr();
  this.ocrWorker = loadedWorker;
}
@UseMiddleware() @UseMiddleware()
async onMessage(session: Session, next: NextFunction) { async onMessage(session: Session, next: NextFunction) {
this.handleMessage(session.content, session.userId).then(); this.parseMessage(session.content, session.userId).then();
return next(); return next();
} }
private combineMessage(message: string) { private parseMessage(message: string, sender: string) {
const segmentChain = segment.parse(message); const segmentChain = segment.parse(message);
const textSegments = segmentChain.filter( const textSegments = segmentChain.filter(
(segment) => segment.type === 'text', (segment) => segment.type === 'text',
); );
return textSegments const textMessage = textSegments
.map((segment) => segment.data.content) .map((segment) => segment.data.content)
.join('') .join('')
.trim(); .trim();
if (!this.ocrWorker) {
return this.handleMessage(textMessage, sender);
}
const imageMessageUrls = segmentChain
.filter((segment) => segment.type === 'image' && segment.data?.url)
.map((segment) => segment.data.url);
return Promise.all([
this.handleMessage(textMessage, sender),
this.workForOcr(imageMessageUrls, sender),
]);
}
private async download(url: string) {
if (url.startsWith(PROTOCOL_BASE64)) {
return Buffer.from(url.slice(PROTOCOL_BASE64.length), 'base64');
}
const data = await this.ctx.http.get.arraybuffer(url);
return Buffer.from(data);
} }
async handleMessage(message: string, sender: string) { private async recognize(imageUrl: string) {
let receivedMessage = this.combineMessage(message) try {
.split('\n') const image = await this.download(imageUrl);
.join(' ') const result = await this.ocrWorker.recognize(image);
.toLowerCase(); if (!result?.data?.text?.length) {
this.log.warn(`Recognition of ${imageUrl} failed.`);
return '';
}
const text = result.data.text;
this.log.info(`Recognition of ${imageUrl}: ${text}`);
return text;
} catch (e) {
this.log.warn(`Errored to recognize ${imageUrl}: ${e.toString()}`);
return '';
}
}
/**
 * Runs OCR over every given image concurrently, concatenates the recognized
 * texts in input order with no separator, and passes the combined text to
 * `handleMessage` with the cache disabled.
 *
 * @param imageUrls Image locations to recognize.
 * @param sender Identifier of the message sender, forwarded unchanged.
 */
private async workForOcr(imageUrls: string[], sender: string) {
  const pendingRecognitions = imageUrls.map((imageUrl) =>
    this.recognize(imageUrl),
  );
  const recognizedParts = await Promise.all(pendingRecognitions);
  const combinedText = recognizedParts.join('');
  return this.handleMessage(combinedText, sender, false);
}
private async handleMessage(
message: string,
sender: string,
useCache = true,
) {
let receivedMessage = message.split('\n').join(' ').toLowerCase();
for (const chineseCharacter of chineseCharacterList) { for (const chineseCharacter of chineseCharacterList) {
receivedMessage = receivedMessage.replace( receivedMessage = receivedMessage.replace(
chineseCharacter.characterRegExp, chineseCharacter.characterRegExp,
...@@ -84,19 +137,23 @@ export default class HisoutensokuJammerPlugin { ...@@ -84,19 +137,23 @@ export default class HisoutensokuJammerPlugin {
); );
} }
const lastMessage = await this.cache.get('lastMessages', sender);
let messageMatch = receivedMessage.match(matcherGlobal); let messageMatch = receivedMessage.match(matcherGlobal);
if (lastMessage) { if (useCache) {
receivedMessage = `${lastMessage} ${receivedMessage}`; const lastMessage = await this.cache.get('lastMessages', sender);
if (lastMessage) {
receivedMessage = `${lastMessage} ${receivedMessage}`;
if (!messageMatch) {
messageMatch = receivedMessage.match(matcherGlobal);
}
}
if (!messageMatch) { if (!messageMatch) {
messageMatch = receivedMessage.match(matcherGlobal); await this.cache.set('lastMessages', sender, receivedMessage);
return;
} }
} await this.cache.del('lastMessages', sender);
if (!messageMatch) { } else if (!messageMatch) {
await this.cache.set('lastMessages', sender, receivedMessage);
return; return;
} }
await this.cache.del('lastMessages', sender);
const attackPromises = messageMatch.map((pattern) => { const attackPromises = messageMatch.map((pattern) => {
const patternMatch = pattern.match(matcherSingle); const patternMatch = pattern.match(matcherSingle);
const firstDigit = patternMatch[1].replace(/ +/g, ''); const firstDigit = patternMatch[1].replace(/ +/g, '');
...@@ -110,7 +167,7 @@ export default class HisoutensokuJammerPlugin { ...@@ -110,7 +167,7 @@ export default class HisoutensokuJammerPlugin {
const results: boolean[] = await Promise.all(attackPromises); const results: boolean[] = await Promise.all(attackPromises);
} }
async startAttack(address: string, port: number): Promise<boolean> { private async startAttack(address: string, port: number): Promise<boolean> {
if (this.config.isWhitelisted(address)) { if (this.config.isWhitelisted(address)) {
this.log.info(`Attack of ${address}:${port} skipped.`); this.log.info(`Attack of ${address}:${port} skipped.`);
return false; return false;
......
...@@ -7,7 +7,8 @@ ...@@ -7,7 +7,8 @@
"emitDecoratorMetadata": true, "emitDecoratorMetadata": true,
"experimentalDecorators": true, "experimentalDecorators": true,
"declaration": true, "declaration": true,
"sourceMap": true "sourceMap": true,
"skipLibCheck": true
}, },
"compileOnSave": true, "compileOnSave": true,
"allowJs": true, "allowJs": true,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment