code for extract info from ass

This commit is contained in:
1c7 2018-06-08 03:39:03 +08:00
parent e24ae168eb
commit cac08500b0
7 changed files with 203 additions and 0 deletions

BIN
.DS_Store vendored

Binary file not shown.

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
extract_from_ass_subtitle/node_modules
extract_from_ass_subtitle/package-lock.json

BIN
extract_from_ass_subtitle/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,97 @@
const assParser = require('ass-parser');
const fs = require('fs');
const common = require('./common.js');
const path = require('path')
/*
1. 是什么
.ass 字幕文件中提取片头文字共2行字幕
仅针对 [计算机科学速成课] Crash Course Computer Science
2. 如何使用
node 2.\ extract_head.js
3. 备注
所有40集 ass 字幕文件要和这个脚本放到同一个目录下
文件命名必须类似1. 计算机早期历史-Early Computing.ass
1. 开头
.ass 结尾
因为我们要按顺序读取
代码是半夜2点写的目的是快速 hack 出来
所以有些代码不是最高效比如循环很多次就不要吐槽啦管用就好
*/
var result = '' // 存临时结果
var final_result = ''; // 存最终结果
var folder = '/Users/remote_edit/Desktop/ass/'
// 循环当前目录下的 ass 文件,按 1234...一直到 40 这样的顺序
function loop_current_folder_ass_file_in_order(){
fs.readdir(folder, (err, files) => {
for (var i = 1; i <= 40; i++) { // 40*40(files)=1600 loop, I know.. I know.. not the most efficient code, but it's good enough
files.forEach(file => { // file == 1. 计算机早期历史-Early Computing.ass
var number = file.split('.')[0] // number == 1
var ext = path.extname(file); // ext == ass
if (parseInt(number) === i && ext === '.ass'){
extract_main_point(file, number);
}
});
}
})
}
// 抽取重点出来那两行字幕的样式在全部40集中都是统一的叫'备注 - 主题'和'这一集的名字'
function extract_main_point(file, number){
var text = fs.readFileSync(file,'utf8')
data = assParser(text);
body = data[3]['body'];
for (var i = 0; i <= body.length-1; i++) {
var element = body[i];
if (element.key == 'Dialogue') {
if (element.value.Style == '这一集的名字'){
var text = element.value.Text;
text = common.remove_curly_brace_keep_text(text);
// console.log(text);
result = result + '## ' + text + '\n';
}
}
}
for (var i = 0; i <= body.length-1; i++) {
var element = body[i];
if (element.key == 'Dialogue') {
if (element.value.Style == '备注 - 主题'){
var text = element.value.Text;
text = common.remove_curly_brace_keep_text(text);
// console.log(text);
result = result + text.replace(/\\N/g, '\n') + '\n\n\n';
}
}
}
// 如果到了最后一集
if (number == 40){
// 清理两边空格trim),然后末尾加俩空格作为 markdown 换行
var line_array = result.split('\n');
line_array.forEach(l => {
final_result = final_result + l.trim() + ' ' + '\n';
})
// 处理 HTML entity
final_result = common.convertHTML(final_result);
// 写文件
fs.writeFile("CS_40episode_point.md", final_result, function(err) {
if(err) {
return console.log(err);
}
console.log("The file was saved!");
});
}
}
// =============
loop_current_folder_ass_file_in_order();

View File

@ -0,0 +1,65 @@
const assParser = require('ass-parser');
const fs = require('fs');
const common = require('./common.js');
const path = require('path')
var result = ''
var folder = '/Users/remote_edit/Desktop/ass/'
function loop_current_folder_ass_file_in_order(){
fs.readdir(folder, (err, files) => {
for (var i = 1; i <= 40; i++) { // 40*40(files)=1600 loop, I know.. I know.. not the most efficient code, but it's good enough
files.forEach(file => { // file == 1. 计算机早期历史-Early Computing.ass
var number = file.split('.')[0] // number == 1
var ext = path.extname(file); // ext == ass
if (parseInt(number) === i && ext === '.ass'){
extract_main_point(folder+file, number);
// console.log(file);
}
});
}
})
}
function extract_main_point(file, number){
var text = fs.readFileSync(file,'utf8')
data = assParser(text);
body = data[3]['body'];
for (var i = 0; i <= body.length-1; i++) {
var element = body[i];
if (element.key == 'Dialogue') {
if (element.value.Style == 'en - 白色'){
var text = element.value.Text;
text = common.remove_curly_brace_keep_text(text);
result = result + text.trim() + '\n';
}
}
}
for (var i = 0; i <= body.length-1; i++) {
var element = body[i];
if (element.key == 'Dialogue') {
if (element.value.Style == 'zh - 黄色'){
var text = element.value.Text;
text = common.remove_curly_brace_keep_text(text);
result = result + text.trim() + '\n';
}
}
}
// 写文件
fs.writeFile(file+'.txt', result, function(err) {
if(err) {
return console.log(err);
}
console.log("The file was saved!");
});
result = '';
}
// =============
loop_current_folder_ass_file_in_order();

View File

@ -0,0 +1,25 @@
// Remove all {}
// Input: "{\c&HCC9933&}Subtitles by {\c\c&HFFFFFF &}MemoryOnSmells{\c} {\c&HCC9933&}Exclusive for http://UKsubtitles.ru{\c}"
// Output: "Subtitles by MemoryOnSmells Exclusive for http://UKsubtitles.ru"
function remove_curly_brace_keep_text(str) {
return str.replace(/\s*\{.*?\}\s*/g, ' ').trim();
}
function convertHTML(str) {
var entityPairs = [
{character: '&', html: '&amp;'},
{character: '<', html: '&lt;'},
{character: '>', html: '&gt;'},
{character: "'", html: '&apos;'},
{character: '"', html: '&quot;'},
];
entityPairs.forEach(function(pair){
var reg = new RegExp(pair.character, 'g');
str = str.replace(reg, pair.html);
});
return str;
}
exports.remove_curly_brace_keep_text = remove_curly_brace_keep_text
exports.convertHTML = convertHTML

View File

@ -0,0 +1,14 @@
{
"name": "cc_nodejs",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"ass-parser": "^0.2.0"
}
}