Skip to content

Commit

Permalink
feat: replace js-tiktoken with php-tiktoken
Browse files Browse the repository at this point in the history
  • Loading branch information
HardeepAsrani committed Oct 8, 2024
1 parent 36c3685 commit 8f469a7
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 157 deletions.
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"codeinwp/themeisle-sdk": "^3.3",
"hkulekci/qdrant": "^0.5.7",
"symfony/http-client": "^6.4",
"nyholm/psr7": "^1.8"
"nyholm/psr7": "^1.8",
"yethee/tiktoken": "^0.6.0"
}
}
53 changes: 52 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 17 additions & 17 deletions inc/API.php
Original file line number Diff line number Diff line change
Expand Up @@ -461,19 +461,19 @@ public function get_data( $request ) {
* @throws \Exception If Qdrant API fails.
*/
public function add_data( $request ) {
$data = $request->get_param( 'data' );
$content = apply_filters( 'the_content', get_post_field( 'post_content', $data['post_id'] ) );
$chunks = str_split( $content, 2000 );

$moderation = $this->moderate( $chunks, $data['post_id'] );
$data = $request->get_param( 'data' );
$post_id = $data['ID'];
$data = $this->tokenize( $data );
$chunks = array_column( $data, 'post_content' );
$moderation = $this->moderate( $chunks, $post_id );

if ( is_wp_error( $moderation ) ) {
return rest_ensure_response( [ 'error' => $this->get_error_message( $moderation ) ] );
}

if ( true !== $moderation && 'override' !== $request->get_param( 'action' ) ) {
update_post_meta( $data['post_id'], '_hyve_moderation_failed', 1 );
update_post_meta( $data['post_id'], '_hyve_moderation_review', $moderation );
update_post_meta( $post_id, '_hyve_moderation_failed', 1 );
update_post_meta( $post_id, '_hyve_moderation_review', $moderation );

return rest_ensure_response(
[
Expand All @@ -487,7 +487,7 @@ public function add_data( $request ) {
if ( 'update' === $request->get_param( 'action' ) ) {
if ( Qdrant_API::is_active() ) {
try {
$delete_result = Qdrant_API::instance()->delete_point( $data['post_id'] );
$delete_result = Qdrant_API::instance()->delete_point( $post_id );

if ( ! $delete_result ) {
throw new \Exception( __( 'Failed to delete point in Qdrant.', 'hyve-lite' ) );
Expand All @@ -497,18 +497,18 @@ public function add_data( $request ) {
}
}

$this->table->delete_by_post_id( $data['post_id'] );

delete_post_meta( $data['post_id'], '_hyve_needs_update' );
$this->table->delete_by_post_id( $post_id );
delete_post_meta( $post_id, '_hyve_needs_update' );
}

$post_id = $this->table->insert( $data );

update_post_meta( $data['post_id'], '_hyve_added', 1 );
delete_post_meta( $data['post_id'], '_hyve_moderation_failed' );
delete_post_meta( $data['post_id'], '_hyve_moderation_review' );
foreach ( $data as $datum ) {
$id = $this->table->insert( $datum );
$this->table->process_post( $id );
}

$this->table->process_post( $post_id );
update_post_meta( $post_id, '_hyve_added', 1 );
delete_post_meta( $post_id, '_hyve_moderation_failed' );
delete_post_meta( $post_id, '_hyve_moderation_review' );

return rest_ensure_response( true );
}
Expand Down
98 changes: 98 additions & 0 deletions inc/BaseAPI.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
use ThemeIsle\HyveLite\Main;
use ThemeIsle\HyveLite\DB_Table;
use ThemeIsle\HyveLite\OpenAI;
use Yethee\Tiktoken\EncoderProvider;

/**
* BaseAPI class.
Expand Down Expand Up @@ -157,4 +158,101 @@ public function moderate( $chunks, $id = null ) {

return $return;
}

/**
 * Tokenize post data into one or more records ready for storage.
 *
 * HTML tags are stripped from the content, which is then encoded with the
 * `cl100k_base` encoding. If the stripped content is longer than 1000 tokens
 * it is split with create_chunks(); otherwise it is kept as a single piece.
 * Every piece is encoded again with the post title prepended, so a record's
 * `token_count` can be somewhat larger than the 1000-token chunk budget.
 *
 * @param array $post Post data. Reads the `ID` (optional), `title` and `content` keys.
 *
 * @return array List of records, each with the keys `post_id`, `post_title`,
 *               `post_content` (tag-stripped text), `tokens` and `token_count`.
 */
public function tokenize( $post ) {
	$provider = new EncoderProvider();
	$encoder  = $provider->get( 'cl100k_base' );

	$post_id = $post['ID'] ?? null;
	$title   = (string) ( $post['title'] ?? '' );
	$raw     = (string) ( $post['content'] ?? '' );

	// preg_replace() returns null on a PCRE error; fall back to the raw text
	// so the encoder always receives a string.
	$content = preg_replace( '/<[^>]+>/', '', $raw ) ?? $raw;

	$chunked_token_size = 1000;

	// The size check is made on the content alone, without the title.
	if ( count( $encoder->encode( $content ) ) > $chunked_token_size ) {
		$pieces = $this->create_chunks( $content, $chunked_token_size );
	} else {
		// Store the same tag-stripped text the tokens are computed from, so
		// chunked and unchunked records hold consistent content.
		$pieces = [ $content ];
	}

	$data = [];

	foreach ( $pieces as $piece ) {
		$chunked_tokens = $encoder->encode( $title . ' ' . $piece );

		$data[] = [
			'post_id'      => $post_id,
			'post_title'   => $title,
			'post_content' => $piece,
			'tokens'       => $chunked_tokens,
			'token_count'  => count( $chunked_tokens ),
		];
	}

	return $data;
}

/**
 * Split text into chunks of whole sentences that fit a token budget.
 *
 * The text is split on ". " and sentences are accumulated until adding the
 * next one would exceed `$size` tokens (counted with the `cl100k_base`
 * encoding), at which point the buffered sentences are emitted as one chunk.
 *
 * NOTE: a single sentence that is longer than `$size` tokens on its own is
 * skipped and does not appear in any chunk.
 *
 * @param string $text Text to chunk.
 * @param int    $size Maximum number of tokens per chunk.
 *
 * @return array List of chunk strings; empty when the text is blank or every
 *               sentence is larger than `$size`.
 */
public function create_chunks( $text, $size = 1000 ) {
	$text = (string) $text;

	// Nothing to chunk; avoids emitting a lone "." for blank input.
	if ( '' === trim( $text ) ) {
		return [];
	}

	$provider = new EncoderProvider();
	$encoder  = $provider->get( 'cl100k_base' );

	// Re-join buffered sentences, adding the final period only when the last
	// sentence does not already carry one (prevents "..").
	$join = static function ( array $buffer ) {
		$joined = implode( '. ', $buffer );

		return '.' === substr( $joined, -1 ) ? $joined : $joined . '.';
	};

	$sentences = explode( '. ', $text );

	$chunks        = [];
	$tokens_so_far = 0;
	$chunk         = [];

	foreach ( $sentences as $sentence ) {
		$token_length = count( $encoder->encode( ' ' . $sentence ) );

		// Flush only when something is buffered; otherwise an oversized
		// sentence arriving on an empty buffer would emit a bogus "." chunk.
		if ( $tokens_so_far + $token_length > $size && 0 < count( $chunk ) ) {
			$chunks[]      = $join( $chunk );
			$chunk         = [];
			$tokens_so_far = 0;
		}

		// A sentence that cannot fit in any chunk is skipped.
		if ( $token_length > $size ) {
			continue;
		}

		$chunk[] = $sentence;
		// One extra token is budgeted per sentence for the separator.
		$tokens_so_far += $token_length + 1;
	}

	if ( 0 < count( $chunk ) ) {
		$chunks[] = $join( $chunk );
	}

	return $chunks;
}
}
11 changes: 1 addition & 10 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"grunt-wp-readme-to-markdown": "^2.1.0",
"npm-run-all": "^4.1.5",
"replace-in-file": "^7.1.0",
"semantic-release": "^19.0.5",
"semantic-release": "^19.0.5",
"semantic-release-slack-bot": "^4.0.2",
"simple-git-hooks": "^2.9.0",
"tailwindcss": "^3.4.0"
Expand All @@ -59,7 +59,6 @@
"@dqbd/tiktoken": "^1.0.13",
"@wordpress/icons": "^9.39.0",
"classnames": "^2.5.1",
"js-tiktoken": "^1.0.10",
"object-hash": "^3.0.0"
}
}
5 changes: 0 additions & 5 deletions src/backend/parts/data/Custom.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
/**
* External dependencies.
*/
import hash from 'object-hash';

/**
* WordPress dependencies.
*/
Expand Down
Loading

0 comments on commit 8f469a7

Please sign in to comment.