Commit b24c804d authored by nanahira

ocr

parent 51c6ce2c
This diff is collapsed.
......@@ -57,11 +57,15 @@ const chineseCharacterWordList = [
{ character: '', value: 0 },
{ character: '', value: 0 },
{ character: '', value: 0 },
{ character: 'O', value: 0 },
{ character: 'o', value: 0 },
{ character: '', value: 1 },
{ character: '', value: 1 },
{ character: '', value: 1 },
{ character: '', value: 1 },
{ character: '', value: 1 },
{ character: 'i', value: 1 },
{ character: 'I', value: 1 },
{ character: '', value: 2 },
{ character: '', value: 2 },
{ character: '', value: 2 },
......@@ -74,6 +78,7 @@ const chineseCharacterWordList = [
{ character: '', value: 5 },
{ character: '', value: 5 },
{ character: '', value: 5 },
{ character: 'S', value: 5 },
{ character: '', value: 6 },
{ character: '', value: 6 },
{ character: '', value: 6 },
......@@ -90,6 +95,8 @@ const chineseCharacterWordList = [
{ character: '', value: 8 },
{ character: '', value: 8 },
{ character: '', value: 8 },
{ character: 'B', value: 8 },
{ character: 'b', value: 8 },
{ character: '', value: 9 },
{ character: '', value: 9 },
{ character: '', value: 9 },
......@@ -101,6 +108,9 @@ const chineseCharacterWordList = [
{ character: '', value: '' },
{ character: '', value: '' },
{ character: '', value: '' },
{ character: '', value: ':' },
{ character: ';', value: ':' },
{ character: '', value: ':' },
].map((l) => ({ character: l.character, value: l.value.toString() }));
const chineseCharacterWordGroup: Record<
string,
......
import 'source-map-support/register';
import { DefineSchema, RegisterSchema } from 'koishi-thirdeye';
import ip from 'ip';
import Tesseract, { createWorker } from 'tesseract.js';
import { Logger } from 'koishi';
import path from 'path';
@RegisterSchema()
export class HisoutensokuJammerPluginConfig {
......@@ -10,6 +13,47 @@ export class HisoutensokuJammerPluginConfig {
/** IP whitelist entries, stored as strings; defaults to an empty list. */
@DefineSchema({ desc: 'IP 白名单', default: [], type: 'string' })
addressWhitelist: string[];
/** Enables text recognition (OCR); `loadOcr` returns early when this is false. Defaults to false. */
@DefineSchema({ desc: '开启文字识别', default: false })
ocr: boolean;
/** Model language handed to the worker's `loadLanguage` and `initialize` calls; defaults to 'eng'. */
@DefineSchema({ desc: '模型语言', default: 'eng' })
ocrLanguage: string;
/** Extra worker options, spread into the `createWorker` call after the built-in logger option. */
@DefineSchema({ type: 'object', allowUnknown: true })
ocrExtraOptions: Partial<Tesseract.WorkerOptions>;
/** Extra worker parameters, spread into `setParameters` after the default character whitelist. */
@DefineSchema({ type: 'object', allowUnknown: true })
ocrExtraParameters: Partial<Tesseract.WorkerParams>;
/**
 * Creates and initializes the Tesseract.js OCR worker when OCR is enabled.
 *
 * The worker is loaded and initialized with `ocrLanguage`, then configured
 * with a default character whitelist (digits and a few separator characters)
 * that `ocrExtraParameters` may override.
 *
 * @returns The ready worker, or `undefined` when OCR is disabled or the
 * worker could not be set up (the failure is logged instead of thrown).
 */
async loadOcr() {
  if (!this.ocr) return;
  const logger = new Logger('jammer-ocr');
  if (this.ocrExtraOptions?.dataPath) {
    logger.info(
      `Will load Tesseract.js data files from ${this.ocrExtraOptions?.dataPath}`,
    );
  }
  // Declared outside the try block so a partially set-up worker can be
  // cleaned up when any of the initialization steps fails.
  let worker: ReturnType<typeof createWorker> | undefined;
  try {
    worker = createWorker({
      logger: (m) => logger.debug(m),
      //dataPath: path.join(__dirname, '..', 'lang-data'),
      ...(this.ocrExtraOptions || {}),
    });
    await worker.load();
    await worker.loadLanguage(this.ocrLanguage);
    await worker.initialize(this.ocrLanguage);
    await worker.setParameters({
      tessedit_char_whitelist: '0123456789.: _-+',
      ...(this.ocrExtraParameters || {}),
    });
    logger.info(`OCR worker loaded.`);
    return worker;
  } catch (e) {
    logger.error(`Failed to load OCR worker: ${e.toString()}`);
    if (worker) {
      try {
        // Do not leave a half-initialized worker running in the background.
        await worker.terminate();
      } catch {
        // The primary failure is already logged; a failed cleanup adds nothing.
      }
    }
    return;
  }
}
/**
 * Builds the subnet description for an address.
 *
 * @param addr An address, optionally already carrying a `/prefix` suffix.
 * A bare address is given a `/32` suffix before being handed to `ip.cidrSubnet`.
 * @returns Whatever `ip.cidrSubnet` returns for the resulting CIDR string.
 */
getSubnet(addr: string) {
  const hasPrefix = addr.includes('/');
  const cidr = hasPrefix ? addr : `${addr}/32`;
  return ip.cidrSubnet(cidr);
}
......
......@@ -15,6 +15,7 @@ import {
import { Attacker } from './attacker';
import moment from 'moment';
import { chineseCharacterList } from './chinese-replace';
import { createWorker, Worker } from 'tesseract.js';
export * from './config';
declare module 'koishi' {
......@@ -31,6 +32,8 @@ declare module 'koishi' {
// Pattern for one "address ... port" occurrence: a leading number group, three
// further number groups each introduced by one non-digit character, at least
// one arbitrary character, then five digits (first one 1-6) with optional spaces.
const matcherSingle = /([1-2]? *\d? *\d *)(([^\d][1-2]?\d{1,2}){3}).+?([1-6] *\d *\d *\d *\d)/;
// Same pattern with the global flag, used to collect every occurrence in a message.
const matcherGlobal = new RegExp(matcherSingle.source, 'g');

// URL scheme prefix marking image data that is embedded inline as base64.
const PROTOCOL_BASE64 = 'base64://';
@KoishiPlugin({
name: 'hisoutensoku-jammer',
schema: HisoutensokuJammerPluginConfig,
......@@ -55,28 +58,78 @@ export default class HisoutensokuJammerPlugin {
@InjectConfig()
private config: HisoutensokuJammerPluginConfig;
ocrWorker: Worker;
/**
 * Handler for the 'connect' event: asks the plugin config to build the OCR
 * worker and stores the result on `ocrWorker`.
 */
@UseEvent('connect')
async loadWorkers() {
  const worker = await this.config.loadOcr();
  this.ocrWorker = worker;
}
/**
 * Middleware entry point: starts processing of the incoming message content
 * for the sending user without awaiting it, then hands control to the next
 * middleware.
 */
@UseMiddleware()
async onMessage(session: Session, next: NextFunction) {
// Fire-and-forget: `.then()` is called without handlers, so the result is
// ignored and a rejection is not handled at this call site.
// NOTE(review): this listing looks like a flattened diff; the handleMessage
// call is presumably the removed line and parseMessage its replacement.
// Confirm against the real file before relying on both calls running.
this.handleMessage(session.content, session.userId).then();
this.parseMessage(session.content, session.userId).then();
return next();
}
private combineMessage(message: string) {
private parseMessage(message: string, sender: string) {
const segmentChain = segment.parse(message);
const textSegments = segmentChain.filter(
(segment) => segment.type === 'text',
);
return textSegments
const textMessage = textSegments
.map((segment) => segment.data.content)
.join('')
.trim();
if (!this.ocrWorker) {
return this.handleMessage(textMessage, sender);
}
const imageMessageUrls = segmentChain
.filter((segment) => segment.type === 'image' && segment.data?.url)
.map((segment) => segment.data.url);
return Promise.all([
this.handleMessage(textMessage, sender),
this.workForOcr(imageMessageUrls, sender),
]);
}
/**
 * Obtains the raw bytes behind an image URL.
 *
 * @param url Either a `base64://` URL whose remainder is the base64-encoded
 * data, or any other URL, which is fetched through `this.ctx.http`.
 * @returns A Buffer holding the decoded or downloaded bytes.
 */
private async download(url: string) {
  const isInline = url.startsWith(PROTOCOL_BASE64);
  if (!isInline) {
    const remote = await this.ctx.http.get.arraybuffer(url);
    return Buffer.from(remote);
  }
  // Inline data: everything after the scheme prefix is the base64 payload.
  const encoded = url.slice(PROTOCOL_BASE64.length);
  return Buffer.from(encoded, 'base64');
}
async handleMessage(message: string, sender: string) {
let receivedMessage = this.combineMessage(message)
.split('\n')
.join(' ')
.toLowerCase();
/**
 * Downloads one image and runs it through the OCR worker.
 *
 * @param imageUrl Location of the image; may be a `base64://` URL carrying
 * the image data inline.
 * @returns The recognized text, or `''` when no text was recognized or any
 * step failed (failures are logged, never thrown).
 */
private async recognize(imageUrl: string) {
  // For inline images the "URL" is the whole encoded payload; log a short
  // placeholder instead so the log is not flooded with image data.
  const label = imageUrl.startsWith(PROTOCOL_BASE64)
    ? `${PROTOCOL_BASE64}<${imageUrl.length - PROTOCOL_BASE64.length} chars>`
    : imageUrl;
  try {
    const image = await this.download(imageUrl);
    const result = await this.ocrWorker.recognize(image);
    if (!result?.data?.text?.length) {
      this.log.warn(`Recognition of ${label} failed.`);
      return '';
    }
    const text = result.data.text;
    this.log.info(`Recognition of ${label}: ${text}`);
    return text;
  } catch (e) {
    this.log.warn(`Errored to recognize ${label}: ${e.toString()}`);
    return '';
  }
}
/**
 * Recognizes every image in parallel, concatenates the recognized texts in
 * input order with no separator, and passes the result to `handleMessage`
 * with caching disabled.
 *
 * @param imageUrls URLs of the images to run through OCR.
 * @param sender Identifier of the message sender, forwarded unchanged.
 * @returns Whatever `handleMessage` resolves to.
 */
private async workForOcr(imageUrls: string[], sender: string) {
  const pending = imageUrls.map((imageUrl) => this.recognize(imageUrl));
  const pieces = await Promise.all(pending);
  const combined = pieces.join('');
  return this.handleMessage(combined, sender, false);
}
private async handleMessage(
message: string,
sender: string,
useCache = true,
) {
let receivedMessage = message.split('\n').join(' ').toLowerCase();
for (const chineseCharacter of chineseCharacterList) {
receivedMessage = receivedMessage.replace(
chineseCharacter.characterRegExp,
......@@ -84,19 +137,23 @@ export default class HisoutensokuJammerPlugin {
);
}
const lastMessage = await this.cache.get('lastMessages', sender);
let messageMatch = receivedMessage.match(matcherGlobal);
if (lastMessage) {
receivedMessage = `${lastMessage} ${receivedMessage}`;
if (useCache) {
const lastMessage = await this.cache.get('lastMessages', sender);
if (lastMessage) {
receivedMessage = `${lastMessage} ${receivedMessage}`;
if (!messageMatch) {
messageMatch = receivedMessage.match(matcherGlobal);
}
}
if (!messageMatch) {
messageMatch = receivedMessage.match(matcherGlobal);
await this.cache.set('lastMessages', sender, receivedMessage);
return;
}
}
if (!messageMatch) {
await this.cache.set('lastMessages', sender, receivedMessage);
await this.cache.del('lastMessages', sender);
} else if (!messageMatch) {
return;
}
await this.cache.del('lastMessages', sender);
const attackPromises = messageMatch.map((pattern) => {
const patternMatch = pattern.match(matcherSingle);
const firstDigit = patternMatch[1].replace(/ +/g, '');
......@@ -110,7 +167,7 @@ export default class HisoutensokuJammerPlugin {
const results: boolean[] = await Promise.all(attackPromises);
}
async startAttack(address: string, port: number): Promise<boolean> {
private async startAttack(address: string, port: number): Promise<boolean> {
if (this.config.isWhitelisted(address)) {
this.log.info(`Attack of ${address}:${port} skipped.`);
return false;
......
......@@ -7,7 +7,8 @@
"emitDecoratorMetadata": true,
"experimentalDecorators": true,
"declaration": true,
"sourceMap": true
"sourceMap": true,
"skipLibCheck": true
},
"compileOnSave": true,
"allowJs": true,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment