Merge pull request #558 from jikan-me/feature/incremental-indexing

🎉 Added incremental indexer command
This commit is contained in:
pushrbx 2024-11-08 19:35:24 +00:00 committed by GitHub
commit 52c030da4a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 257 additions and 7 deletions

View File

@ -14,6 +14,7 @@ For an entire list of commands, you can run `php artisan list`
- [Indexer](#indexer) - [Indexer](#indexer)
- [Anime](#indexer-anime) - [Anime](#indexer-anime)
- [Manga](#indexer-manga) - [Manga](#indexer-manga)
- [Incremental](#indexer-incremental)
## Commands ## Commands
@ -98,7 +99,7 @@ This function only needs to be run once. Any entry's cache updating will automat
Command: Command:
``` ```
indexer:anime indexer:manga
{--failed : Run only entries that failed to index last time} {--failed : Run only entries that failed to index last time}
{--resume : Resume from the last position} {--resume : Resume from the last position}
{--reverse : Start from the end of the array} {--reverse : Start from the end of the array}
@ -109,3 +110,16 @@ indexer:anime
Example: `indexer:manga` Example: `indexer:manga`
This simply translates to running the indexer without any additional configuration. This simply translates to running the indexer without any additional configuration.
#### Indexer: Incremental
Incrementally indexes media entries from MAL.
This command compares the latest list of MAL ids from the [mal_id_cache](https://github.com/purarue/mal-id-cache)
GitHub repository with the ids downloaded during the previous run, and indexes only the entries that are new. If no ids are found from the previous run, a full indexing session is started.
Command:
```
indexer:incremental {mediaType*}
{--failed : Run only entries that failed to index last time}
{--resume : Resume from the last position}
{--delay=3 : Set a delay between requests}
```

View File

@ -2,7 +2,6 @@
namespace App\Console\Commands\Indexer; namespace App\Console\Commands\Indexer;
use App\Exceptions\Console\CommandAlreadyRunningException;
use App\Exceptions\Console\FileNotFoundException; use App\Exceptions\Console\FileNotFoundException;
use Illuminate\Console\Command; use Illuminate\Console\Command;
use Illuminate\Support\Facades\Storage; use Illuminate\Support\Facades\Storage;

View File

@ -0,0 +1,228 @@
<?php
namespace App\Console\Commands\Indexer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Facades\Validator;
/**
 * Console command that indexes only the MAL entries which appeared since the
 * previous run.
 *
 * The latest id list is downloaded from the mal-id-cache repository and
 * compared with the snapshot stored by the previous run. Every new id is then
 * requested through the local API (APP_URL) so the entry gets indexed.
 *
 * Files used under the "indexer/incremental" storage directory:
 *  - {mediaType}.json          snapshot of the last fully processed id list
 *  - {mediaType}.json.tmp      freshly downloaded id list, promoted on completion
 *  - {mediaType}_resume.save   position of the next entry to process
 *  - {mediaType}_failed.json   ids that failed during the last run
 */
class IncrementalIndexer extends Command
{
    /**
     * Set by the signal handler registered in handle(); checked between steps
     * so the command can stop at a safe point.
     *
     * @var bool
     */
    private bool $cancelled = false;

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'indexer:incremental {mediaType*}
                            {--delay=3 : Set a delay between requests}
                            {--resume : Resume from the last position}
                            {--failed : Run only entries that failed to index last time}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Incrementally indexes media entries from MAL.';

    /**
     * Prompts shown when a required argument is missing.
     *
     * @return array
     */
    protected function promptForMissingArgumentsUsing(): array
    {
        return [
            'mediaType' => ['The media type to index.', 'Valid values: anime, manga']
        ];
    }

    /**
     * Loads the id snapshot stored by the previous run.
     *
     * @param string $mediaType
     * @return array{0: string, 1: string} sha1 hash and raw JSON of the snapshot; two empty strings if none exists
     */
    private function getExistingIds(string $mediaType): array
    {
        $snapshotPath = "indexer/incremental/$mediaType.json";

        if (!Storage::exists($snapshotPath))
        {
            return ["", ""];
        }

        // Storage::get() may return null; normalise so sha1() always receives a string.
        $existingIdsRaw = (string) Storage::get($snapshotPath);

        return [sha1($existingIdsRaw), $existingIdsRaw];
    }

    /**
     * Downloads the latest id list and works out which ids have to be fetched.
     *
     * Returns an empty array when nothing changed, the download failed or the
     * command was cancelled. Otherwise the result is keyed by "sfw" / "nsfw".
     * The downloaded list is stored as "{mediaType}.json.tmp" and only becomes
     * the new snapshot once fetchIds() has finished.
     *
     * @param string $mediaType
     * @return array
     */
    private function getIdsToFetch(string $mediaType): array
    {
        [$existingIdsHash, $existingIdsRaw] = $this->getExistingIds($mediaType);

        if ($this->cancelled)
        {
            return [];
        }

        try
        {
            $newIdsRaw = file_get_contents("https://raw.githubusercontent.com/purarue/mal-id-cache/master/cache/{$mediaType}_cache.json");
        }
        catch (\Exception)
        {
            // the framework may convert the PHP warning of a failed download into an exception
            $newIdsRaw = false;
        }

        if ($newIdsRaw === false)
        {
            $this->error("Failed to download the latest $mediaType id list.");
            return [];
        }

        /** @noinspection PhpConditionAlreadyCheckedInspection */
        if ($this->cancelled)
        {
            return [];
        }

        if (sha1($newIdsRaw) === $existingIdsHash)
        {
            // nothing changed since the previous run
            return [];
        }

        $newIds = json_decode($newIdsRaw, true);
        if (!is_array($newIds))
        {
            $this->error("The downloaded $mediaType id list is not valid JSON.");
            return [];
        }

        $existingIds = json_decode($existingIdsRaw, true);

        if (!is_array($existingIds) || count($existingIds) === 0)
        {
            // no usable snapshot from a previous run: index everything
            $idsToFetch = $newIds;
        }
        else
        {
            $idsToFetch = [];
            foreach (["sfw", "nsfw"] as $t)
            {
                // ids present in the new list but missing from the previous snapshot
                $idsToFetch[$t] = array_values(array_diff($newIds[$t] ?? [], $existingIds[$t] ?? []));
            }
        }

        Storage::put("indexer/incremental/$mediaType.json.tmp", $newIdsRaw);

        return $idsToFetch;
    }

    /**
     * Reads the ids recorded as failed by the previous run.
     *
     * @param string $mediaType
     * @return array list of ids; empty if the file is missing or unreadable
     */
    private function getFailedIdsToFetch(string $mediaType): array
    {
        $failedIds = json_decode((string) Storage::get("indexer/incremental/{$mediaType}_failed.json"), true);

        return is_array($failedIds) ? array_values($failedIds) : [];
    }

    /**
     * Requests a single entry from the local API.
     *
     * @param string $url
     * @return array|null decoded JSON response, or null when the request failed or the body was not JSON
     */
    private function requestEntry(string $url): ?array
    {
        try
        {
            $body = file_get_contents($url);
        }
        catch (\Exception)
        {
            return null;
        }

        if ($body === false)
        {
            return null;
        }

        $response = json_decode($body, true);

        return is_array($response) ? $response : null;
    }

    /**
     * Requests every given id through the local API so it gets indexed.
     *
     * @param string $mediaType
     * @param array $idsToFetch ids keyed by "sfw" and/or "nsfw"
     * @param bool $resume continue from the stored resume position
     * @param bool $retryingFailed true when $idsToFetch comes from the failed-ids file; the downloaded
     *                             snapshot is not promoted in that case
     * @return void
     */
    private function fetchIds(string $mediaType, array $idsToFetch, bool $resume, bool $retryingFailed = false): void
    {
        $resumePath = "indexer/incremental/{$mediaType}_resume.save";
        $failedPath = "indexer/incremental/{$mediaType}_failed.json";
        $pendingSnapshotPath = "indexer/incremental/$mediaType.json.tmp";

        $index = 0;
        $success = [];
        $failedIds = [];

        // flatten into a 0-based list so that resume positions map onto it
        $ids = array_values(array_merge($idsToFetch['sfw'] ?? [], $idsToFetch['nsfw'] ?? []));
        $idCount = count($ids);
        $delay = max(0, (int) $this->option('delay'));
        $baseUrl = env('APP_URL');

        if ($resume && Storage::exists($resumePath))
        {
            $index = (int) Storage::get($resumePath);
            $this->info("Resuming from index: $index");
        }

        if ($index !== 0 && !isset($ids[$index]))
        {
            $index = 0;
            $this->warn('Invalid index; set back to 0');
        }

        // keep the starting position so a cancellation before the first request does not lose it
        Storage::put($resumePath, (string) $index);

        $this->info("$idCount $mediaType entries available");

        for ($i = $index; $i < $idCount; $i++)
        {
            if ($this->cancelled)
            {
                return;
            }

            $id = $ids[$i];
            $url = $baseUrl . "/v4/$mediaType/$id";

            $this->info("Indexing/Updating " . ($i + 1) . "/$idCount $url [MAL ID: $id]");

            $response = $this->requestEntry($url);

            if ($response === null)
            {
                $this->warn("[SKIPPED] Failed to fetch $url");
                $failedIds[] = $id;
            }
            elseif (!isset($response['error']))
            {
                $success[] = $id;
            }
            elseif (($response['status'] ?? null) != 404)
            {
                $this->error("[SKIPPED] Failed to fetch $url - {$response['error']}");
                $failedIds[] = $id;
            }
            // a 404 means the entry does not exist: neither a success nor something worth retrying

            if (count($failedIds) > 0)
            {
                Storage::put($failedPath, json_encode($failedIds));
            }

            // store the position of the next entry to process
            Storage::put($resumePath, (string) ($i + 1));

            if ($delay > 0 && $i < $idCount - 1)
            {
                sleep($delay);
            }
        }

        Storage::delete($resumePath);

        $this->info("--- Indexing of $mediaType is complete.");
        $this->info(count($success) . ' entries indexed or updated.');
        if (count($failedIds) > 0)
        {
            $this->info(count($failedIds) . ' entries failed to index or update. Re-run with --failed to requeue failed entries only.');
        }
        elseif ($retryingFailed)
        {
            // every previously failed entry went through, nothing left to retry
            Storage::delete($failedPath);
        }

        // finalize the latest state
        if (!$retryingFailed && Storage::exists($pendingSnapshotPath))
        {
            Storage::move($pendingSnapshotPath, "indexer/incremental/$mediaType.json");
        }
    }

    /**
     * Execute the console command.
     *
     * @return int 0 on success, 1 on invalid input, 127 when cancelled by a signal
     */
    public function handle(): int
    {
        // validate inputs
        // NOTE(review): confirm that "prohibited_with" is a rule registered in the installed
        // illuminate/validation version.
        $validator = Validator::make(
            [
                'mediaType' => $this->argument('mediaType'),
                'delay' => $this->option('delay'),
                'resume' => $this->option('resume') ?? false,
                'failed' => $this->option('failed') ?? false
            ],
            [
                // "mediaType" is an array argument, so each element has to be validated
                'mediaType' => 'required|array',
                'mediaType.*' => 'in:anime,manga',
                'delay' => 'integer|min:1',
                'resume' => 'bool|prohibited_with:failed',
                'failed' => 'bool|prohibited_with:resume'
            ]
        );

        if ($validator->fails()) {
            $this->error($validator->errors()->toJson());
            return 1;
        }

        // we want to handle signals from the OS
        $this->trap([SIGTERM, SIGQUIT, SIGINT], fn () => $this->cancelled = true);

        $resume = (bool) ($this->option('resume') ?? false);
        $onlyFailed = (bool) ($this->option('failed') ?? false);

        /**
         * @var $mediaTypes array
         */
        $mediaTypes = $this->argument("mediaType");

        foreach ($mediaTypes as $mediaType)
        {
            // if "--failed" option is specified just run the failed ones
            $retryingFailed = $onlyFailed && Storage::exists("indexer/incremental/{$mediaType}_failed.json");

            if ($retryingFailed)
            {
                $idsToFetch = ["sfw" => $this->getFailedIdsToFetch($mediaType), "nsfw" => []];
            }
            else
            {
                $idsToFetch = $this->getIdsToFetch($mediaType);
            }

            if ($this->cancelled)
            {
                return 127;
            }

            if (count($idsToFetch) === 0)
            {
                continue;
            }

            $this->fetchIds($mediaType, $idsToFetch, $resume, $retryingFailed);
        }

        // fetchIds() returns early on cancellation; report that to the caller as well
        return $this->cancelled ? 127 : 0;
    }
}

View File

@ -24,7 +24,8 @@ class Kernel extends ConsoleKernel
Indexer\GenreIndexer::class, Indexer\GenreIndexer::class,
Indexer\ProducersIndexer::class, Indexer\ProducersIndexer::class,
Indexer\AnimeSweepIndexer::class, Indexer\AnimeSweepIndexer::class,
Indexer\MangaSweepIndexer::class Indexer\MangaSweepIndexer::class,
Indexer\IncrementalIndexer::class
]; ];
/** /**

View File

@ -3,8 +3,6 @@
namespace App\Features; namespace App\Features;
use App\Dto\QuerySpecificAnimeSeasonCommand; use App\Dto\QuerySpecificAnimeSeasonCommand;
use App\Enums\AnimeSeasonEnum;
use App\Enums\AnimeStatusEnum;
use App\Enums\AnimeTypeEnum; use App\Enums\AnimeTypeEnum;
use Illuminate\Contracts\Database\Query\Builder; use Illuminate\Contracts\Database\Query\Builder;
use Illuminate\Support\Carbon; use Illuminate\Support\Carbon;

View File

@ -14,6 +14,7 @@
"php": "^8.1", "php": "^8.1",
"ext-json": "*", "ext-json": "*",
"ext-mongodb": "*", "ext-mongodb": "*",
"ext-pcntl": "*",
"amphp/http-client": "^4.6", "amphp/http-client": "^4.6",
"danielmewes/php-rql": "dev-master", "danielmewes/php-rql": "dev-master",
"darkaonline/swagger-lume": "^9.0", "darkaonline/swagger-lume": "^9.0",

View File

@ -34,6 +34,7 @@ display_help() {
echo "stop Stop Jikan API" echo "stop Stop Jikan API"
echo "validate-prereqs Validate pre-reqs installed (docker, docker-compose)" echo "validate-prereqs Validate pre-reqs installed (docker, docker-compose)"
echo "execute-indexers Execute the indexers, which will scrape and index data from MAL. (Notice: This can take days)" echo "execute-indexers Execute the indexers, which will scrape and index data from MAL. (Notice: This can take days)"
echo "index-incrementally Executes the incremental indexers for each media type. (anime, manga)"
echo "" echo ""
} }
@ -168,6 +169,10 @@ case "$1" in
$DOCKER_COMPOSE_CMD -p "$DOCKER_COMPOSE_PROJECT_NAME" exec jikan_rest php /app/artisan indexer:producers $DOCKER_COMPOSE_CMD -p "$DOCKER_COMPOSE_PROJECT_NAME" exec jikan_rest php /app/artisan indexer:producers
echo "Indexing done!" echo "Indexing done!"
;; ;;
"index-incrementally")
    # Runs the incremental indexer for both media types inside the jikan_rest container.
    echo "Indexing..."
    $DOCKER_COMPOSE_CMD -p "$DOCKER_COMPOSE_PROJECT_NAME" exec jikan_rest php /app/artisan indexer:incremental anime manga
    echo "Indexing done!"
    # terminate the branch; without it the following "*)" pattern is a syntax error
    ;;
*) *)
echo "No command specified, displaying help" echo "No command specified, displaying help"
display_help display_help

View File

@ -16,6 +16,9 @@ This will:
> **Note**: The script supports both `docker` and `podman`. In case of `podman` please bear in mind that sometimes the container name resolution doesn't work on the container network. > **Note**: The script supports both `docker` and `podman`. In case of `podman` please bear in mind that sometimes the container name resolution doesn't work on the container network.
> In those cases you might have to install `aardvark-dns` package. On `Arch Linux` podman uses `netavark` network by default (in 2023) so you will need to install the before mentioned package. > In those cases you might have to install `aardvark-dns` package. On `Arch Linux` podman uses `netavark` network by default (in 2023) so you will need to install the before mentioned package.
> **Note 2**: The script will start the jikan API, but if you start it for the first time, it won't have any data in it!
> You will have to run the indexers through artisan to have data. See ["Running the indexer with the script"](#running-the-indexer-with-the-script) section.
The script has the following prerequisites and will notify you if these are not present: The script has the following prerequisites and will notify you if these are not present:
- git - git
@ -36,6 +39,7 @@ start Start Jikan API (mongodb, typesense, redis, jikan-api wor
stop Stop Jikan API stop Stop Jikan API
validate-prereqs Validate pre-reqs installed (docker, docker-compose) validate-prereqs Validate pre-reqs installed (docker, docker-compose)
execute-indexers Execute the indexers, which will scrape and index data from MAL. (Notice: This can take days) execute-indexers Execute the indexers, which will scrape and index data from MAL. (Notice: This can take days)
index-incrementally Executes the incremental indexers for each media type. (anime, manga)
``` ```
### Running the indexer with the script ### Running the indexer with the script