Commit 0d47e77

feat: add model to translate script
1 parent 4490fd7 commit 0d47e77

7 files changed: +102 -31 lines

packages/translate/src/chunk.ts

Lines changed: 27 additions & 5 deletions
@@ -1,8 +1,22 @@
+import type { DeepSeekModel } from './types';
+
 // Constants for token estimation
 export const CHAR_TO_TOKEN_RATIO = 0.3; // 1 English character ≈ 0.3 token
 export const CHAR_TO_TOKEN_RATIO_ZH = 0.5; // 1 Chinese character ≈ 0.5 token
 export const MAX_INPUT_TOKENS = 64 * 1024; // DeepSeek's 64K context length
-export const MAX_OUTPUT_TOKENS = 8 * 1024; // DeepSeek's 8K max output
+
+// Model-specific output token limits
+export const MAX_OUTPUT_TOKENS_CHAT = 8 * 1024; // deepseek-chat: max 8K output
+export const MAX_OUTPUT_TOKENS_REASONER = 64 * 1024; // deepseek-reasoner: max 64K output
+
+// Get max output tokens for a specific model
+export function getMaxOutputTokens(
+  model: DeepSeekModel = 'deepseek-chat',
+): number {
+  return model === 'deepseek-reasoner'
+    ? MAX_OUTPUT_TOKENS_REASONER
+    : MAX_OUTPUT_TOKENS_CHAT;
+}
 
 // Chunk size constants (in estimated tokens)
 export const MAX_CHUNK_SIZE_TOKENS = 16 * 1024; // Use smaller chunks for better translation quality

@@ -13,12 +27,20 @@ export function estimateTokens(content: string): number {
   return Math.ceil(content.length * CHAR_TO_TOKEN_RATIO_ZH);
 }
 
-export function needsChunking(content: string): boolean {
-  return estimateTokens(content) > MAX_OUTPUT_TOKENS;
+export function needsChunking(
+  content: string,
+  model: DeepSeekModel = 'deepseek-chat',
+): boolean {
+  return estimateTokens(content) > getMaxOutputTokens(model);
 }
 
 // Split text into chunks that respect markdown structure and heading hierarchy
-export function splitIntoChunks(content: string): string[] {
+export function splitIntoChunks(
+  content: string,
+  model: DeepSeekModel = 'deepseek-chat',
+): string[] {
+  const maxOutputTokens = getMaxOutputTokens(model);
+
   // Define a regex pattern for markdown headings (## Heading)
   const headingPattern = /^(#{2,}) /gm;

@@ -65,7 +87,7 @@ export function splitIntoChunks(content: string): string[] {
   for (const section of sections) {
     const sectionTokens = estimateTokens(section);
 
-    if (currentTokens + sectionTokens > MAX_OUTPUT_TOKENS) {
+    if (currentTokens + sectionTokens > maxOutputTokens) {
      // If adding this section would exceed the limit, start a new chunk
      chunks.push(currentChunk);
      currentChunk = section;
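As a quick illustration (not part of the commit), here is how the new model parameter changes the chunking decision. The repeated string and import specifier are assumptions for the sketch, and it presumes estimateTokens takes the Chinese-ratio path shown above:

import {
  getMaxOutputTokens,
  needsChunking,
  splitIntoChunks,
} from './chunk'; // import path assumed

// ~40,000 Chinese characters ≈ 20,000 estimated tokens at the 0.5 ratio
const doc = '文'.repeat(40_000);

getMaxOutputTokens('deepseek-chat'); // 8192 (8 * 1024)
getMaxOutputTokens('deepseek-reasoner'); // 65536 (64 * 1024)

needsChunking(doc, 'deepseek-chat'); // true: 20,000 > 8,192
needsChunking(doc, 'deepseek-reasoner'); // false: 20,000 <= 65,536

// With the chat model's smaller budget, the document is routed through
// the chunked, multi-round translation path
const chunks = splitIntoChunks(doc, 'deepseek-chat');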

packages/translate/src/index.ts

Lines changed: 7 additions & 1 deletion
@@ -4,7 +4,7 @@ import { Command } from 'commander';
 import { getConfig } from './config';
 import { logger } from './logger';
 import { main } from './main';
-import type { MainConfig } from './types';
+import type { DeepSeekModel, MainConfig } from './types';
 
 export type Config = MainConfig | MainConfig[];
 
@@ -41,6 +41,10 @@ program
     '--concurrency <number>',
     'Number of concurrent translation tasks (default: 10)',
   )
+  .option(
+    '-m, --model <model>',
+    'DeepSeek model to use: "deepseek-chat" or "deepseek-reasoner" (default: "deepseek-chat")',
+  )
   .action(
     async (options: {
       config?: string;
@@ -51,6 +55,7 @@ program
       targetLanguage?: string;
       max?: number;
       concurrency?: number;
+      model?: DeepSeekModel;
     }) => {
       if (options.verbose) {
         logger.setVerbose(true);
@@ -66,6 +71,7 @@ program
         ...(options.docsPath ? { docsPath: options.docsPath } : {}),
         ...(options.max ? { max: options.max } : {}),
         ...(options.concurrency ? { concurrency: options.concurrency } : {}),
+        ...(options.model ? { model: options.model } : {}),
         verbose: options.verbose,
         listOnly: options.listOnly,
         targetLanguage: options.targetLanguage,

packages/translate/src/main.ts

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ export async function main({
   targetLanguage,
   concurrency = 10,
   verbose,
+  model = 'deepseek-chat',
 }: MainConfig): Promise<void> {
   // Filter languages based on targetLanguage if specified
   const filteredLangs = targetLanguage

@@ -211,6 +212,7 @@ export async function main({
         targetPath: task.targetPath,
         langConfig,
         docsContext,
+        model,
       });
 
       completedRefDocs++;
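For context, a hypothetical programmatic call (not from the commit) showing how the optional model field flows through MainConfig into main(); only fields visible in this diff are shown, and a real config will need the rest of MainConfig as well:

import { main } from './main'; // import path assumed

await main({
  targetLanguage: 'zh',
  concurrency: 5,
  verbose: true,
  // Omitting `model` falls back to the destructuring default
  // 'deepseek-chat'; setting it threads the choice down to
  // translateDoc for every translation task.
  model: 'deepseek-reasoner',
  // ...other MainConfig fields (docsPath, etc.) elided for brevity
});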

packages/translate/src/openai.ts

Lines changed: 12 additions & 7 deletions
@@ -1,12 +1,13 @@
 import OpenAI from 'openai';
 import type { ChatCompletionMessageParam } from 'openai/resources.mjs';
 import {
-  MAX_OUTPUT_TOKENS,
   estimateTokens,
+  getMaxOutputTokens,
   needsChunking,
   splitIntoChunks,
 } from './chunk';
 import { logger } from './logger';
+import type { DeepSeekModel } from './types';
 import { type Usage, addUsage } from './usage';
 
 interface LangConfig {

@@ -18,6 +19,7 @@ interface TranslateDocumentParams {
   content: string;
   langConfig: LangConfig;
   context?: string;
+  model?: DeepSeekModel;
 }
 
 // Initialize OpenAI client if API key is available

@@ -46,6 +48,7 @@ async function translateChunk(
   langConfig: LangConfig,
   context: string,
   needsFrontmatterRules = true,
+  modelName: DeepSeekModel = 'deepseek-chat',
 ): Promise<string> {
   if (!openai) {
     throw new Error('OPENAI_API_KEY is not set.');

@@ -341,9 +344,9 @@ The next message contains the COMPLETE original text that needs to be translated
   // console.log(chunk);
 
   const response = await openai.chat.completions.create({
-    model: model,
-    max_completion_tokens: MAX_OUTPUT_TOKENS,
-    max_tokens: MAX_OUTPUT_TOKENS,
+    model: modelName,
+    max_completion_tokens: getMaxOutputTokens(modelName),
+    max_tokens: getMaxOutputTokens(modelName),
     messages: messages,
   });
 
@@ -363,6 +366,7 @@ export async function $translateDocument({
   content,
   langConfig,
   context = '',
+  model: modelName = 'deepseek-chat',
 }: TranslateDocumentParams): Promise<string> {
   if (!openai) {
     throw new Error('OPENAI_API_KEY is not set.');

@@ -374,14 +378,14 @@ export async function $translateDocument({
   );
 
   // For small documents, use the direct approach
-  if (!needsChunking(content)) {
-    return await translateChunk(content, langConfig, context, true);
+  if (!needsChunking(content, modelName)) {
+    return await translateChunk(content, langConfig, context, true, modelName);
   }
 
   logger.debug(
     'Document is large, splitting into chunks for multi-round translation',
   );
-  const chunks = splitIntoChunks(content);
+  const chunks = splitIntoChunks(content, modelName);
   logger.debug(`Split document into ${chunks.length} chunks`);
 
   let translatedContent = '';

@@ -395,6 +399,7 @@ export async function $translateDocument({
       langConfig,
       context,
       i === 0,
+      modelName,
     );
 
     // Add to the complete translated content
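And a hypothetical caller (not from the commit) for the updated $translateDocument; the langConfig values are made up and borrowed from the LangConfig shape in types.ts, which may not match the local interface in openai.ts exactly:

import { $translateDocument } from './openai'; // import path assumed

const translated = await $translateDocument({
  content: '# Title\n\nSome markdown to translate.',
  // locale/name follow the LangConfig in types.ts; the local interface
  // in openai.ts may declare different fields.
  langConfig: { locale: 'zh-CN', name: 'Simplified Chinese' },
  model: 'deepseek-reasoner', // optional; defaults to 'deepseek-chat'
});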

packages/translate/src/types.ts

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,5 @@
+export type DeepSeekModel = 'deepseek-chat' | 'deepseek-reasoner';
+
 export interface LangConfig {
   locale: string;
   name: string;

@@ -30,6 +32,7 @@ export interface MainConfig {
   targetLanguage?: string;
   concurrency?: number;
   verbose?: boolean;
+  model?: DeepSeekModel;
 }
 
 export interface TranslationResult {

packages/translate/src/utils.ts

Lines changed: 4 additions & 0 deletions
@@ -6,6 +6,7 @@ import matter from 'gray-matter';
 import { needsChunking, splitIntoChunks } from './chunk';
 import { logger } from './logger';
 import { $translateDocument } from './openai';
+import type { DeepSeekModel } from './types';
 
 interface LangConfig {
   name: string;

@@ -167,13 +168,15 @@ interface TranslateDocumentFileParams {
   targetPath: string;
   langConfig: LangConfig;
   docsContext?: string;
+  model?: DeepSeekModel;
 }
 
 export async function translateDoc({
   sourcePath,
   targetPath,
   langConfig,
   docsContext,
+  model = 'deepseek-chat',
 }: TranslateDocumentFileParams) {
   // Create directory if it doesn't exist
   logger.debug(`Translating ${sourcePath} to ${targetPath}`);

@@ -192,6 +195,7 @@ export async function translateDoc({
     content: sourceContent,
     langConfig,
     context: translationContext,
+    model,
   });
 
   // Format as ISO strings (UTC)

packages/translate/tests/unit/chunk.test.ts

Lines changed: 47 additions & 18 deletions
@@ -1,16 +1,27 @@
-import { describe, expect, it, vi } from 'vitest';
+import { describe, expect, it } from 'vitest';
 import {
-  CHAR_TO_TOKEN_RATIO,
   CHAR_TO_TOKEN_RATIO_ZH,
-  MAX_CHUNK_SIZE_TOKENS,
-  MAX_INPUT_TOKENS,
-  MAX_OUTPUT_TOKENS,
+  MAX_OUTPUT_TOKENS_CHAT,
+  MAX_OUTPUT_TOKENS_REASONER,
   estimateTokens,
+  getMaxOutputTokens,
   needsChunking,
   splitIntoChunks,
 } from '../../src/chunk';
 
 describe('chunk', () => {
+  describe('getMaxOutputTokens', () => {
+    it('should return correct token limits for deepseek-chat', () => {
+      expect(getMaxOutputTokens('deepseek-chat')).toBe(MAX_OUTPUT_TOKENS_CHAT);
+    });
+
+    it('should return correct token limits for deepseek-reasoner', () => {
+      expect(getMaxOutputTokens('deepseek-reasoner')).toBe(
+        MAX_OUTPUT_TOKENS_REASONER,
+      );
+    });
+  });
+
   describe('estimateTokens', () => {
     it('should estimate tokens based on content length', () => {
       // Create test strings of different lengths

@@ -34,23 +45,41 @@ describe('chunk', () => {
   });
 
   describe('needsChunking', () => {
-    it('should return true for content exceeding MAX_OUTPUT_TOKENS', () => {
-      // Create a string that would exceed the MAX_OUTPUT_TOKENS
-      // MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH gives us the number of characters needed
+    it('should return true for content exceeding MAX_OUTPUT_TOKENS for deepseek-chat', () => {
+      // Create a string that would exceed the MAX_OUTPUT_TOKENS_CHAT
+      // MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH gives us the number of characters needed
+      const exceedMaxTokens = 'a'.repeat(
+        Math.ceil(MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
+      );
+
+      expect(needsChunking(exceedMaxTokens, 'deepseek-chat')).toBe(true);
+    });
+
+    it('should return false for content within MAX_OUTPUT_TOKENS for deepseek-chat', () => {
+      // Create a string that would be below the MAX_OUTPUT_TOKENS_CHAT
+      const withinMaxTokens = 'a'.repeat(
+        Math.ceil(MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH / 2),
+      );
+
+      expect(needsChunking(withinMaxTokens, 'deepseek-chat')).toBe(false);
+    });
+
+    it('should return true for content exceeding MAX_OUTPUT_TOKENS for deepseek-reasoner', () => {
+      // Create a string that would exceed the MAX_OUTPUT_TOKENS_REASONER
       const exceedMaxTokens = 'a'.repeat(
-        Math.ceil(MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
+        Math.ceil(MAX_OUTPUT_TOKENS_REASONER / CHAR_TO_TOKEN_RATIO_ZH) + 1000,
       );
 
-      expect(needsChunking(exceedMaxTokens)).toBe(true);
+      expect(needsChunking(exceedMaxTokens, 'deepseek-reasoner')).toBe(true);
     });
 
-    it('should return false for content within MAX_OUTPUT_TOKENS', () => {
-      // Create a string that would be below the MAX_OUTPUT_TOKENS
+    it('should return false for content within MAX_OUTPUT_TOKENS for deepseek-reasoner', () => {
+      // Create a string that would be below the MAX_OUTPUT_TOKENS_REASONER
       const withinMaxTokens = 'a'.repeat(
-        Math.ceil(MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH / 2),
+        Math.ceil(MAX_OUTPUT_TOKENS_REASONER / CHAR_TO_TOKEN_RATIO_ZH / 2),
      );
 
-      expect(needsChunking(withinMaxTokens)).toBe(false);
+      expect(needsChunking(withinMaxTokens, 'deepseek-reasoner')).toBe(false);
     });
   });

@@ -72,7 +101,7 @@ Content for section 2.
 
 More content.`;
 
-    const chunks = splitIntoChunks(content);
+    const chunks = splitIntoChunks(content, 'deepseek-chat');
 
     // The current implementation doesn't split by markdown headings as expected
     // so we're testing the actual behavior

@@ -87,10 +116,10 @@ More content.`;
    it('should handle large sections with the current implementation', () => {
      // Create a very large section without headings
      const largeSection = 'a'.repeat(
-        Math.ceil((MAX_OUTPUT_TOKENS / CHAR_TO_TOKEN_RATIO_ZH) * 3),
+        Math.ceil((MAX_OUTPUT_TOKENS_CHAT / CHAR_TO_TOKEN_RATIO_ZH) * 3),
      );
 
-      const chunks = splitIntoChunks(largeSection);
+      const chunks = splitIntoChunks(largeSection, 'deepseek-chat');
 
      // The current implementation returns a single large chunk
      expect(chunks.length).toBeGreaterThanOrEqual(1);

@@ -100,7 +129,7 @@ More content.`;
    });
 
    it('should handle empty content', () => {
-      const chunks = splitIntoChunks('');
+      const chunks = splitIntoChunks('', 'deepseek-chat');
      expect(chunks).toEqual([]);
    });
  });
