# !pip install -qU requests datasets huggingface_hub tqdm
SpID Eval
I took lots of photos of cool spiders and other critters in Zimbabwe and neighboring countries. I figured they'd make a nice, hard eval for multi-modal models! This notebook scrapes together 100 of my research-grade observations and wraps them up in a tidy Hugging Face dataset to use for future research. I also have several private versions (made the same way, but with different selection criteria) that I'll keep secret to avoid training-data contamination.
# from huggingface_hub import notebook_login
# notebook_login()
# inat request helpers
= "https://api.inaturalist.org/v1/observations"
INAT_OBS_URL = "https://api.inaturalist.org/v1/taxa"
INAT_TAXA_URL
# project‑specific knobs
= 113055 # “Southern Africa” on iNat
PLACE_ID = "jonathan_whitaker"
USER_ID = 500 # over‑collect because we’ll discard some
RAW_TARGET = 100
FINAL_TARGET = 4 # 4 wrong + 1 correct = 5‑choice MCQ DISTRACTORS
import requests, random, itertools, tqdm, time
def inat_observations(page=1, per_page=100):
    params = dict(user_id=USER_ID,
                  place_id=PLACE_ID,
                  per_page=per_page,
                  page=page,
                  quality_grade="research",   # only research-grade
                  order_by="observed_on",
                  order="desc")
    return requests.get(INAT_OBS_URL, params=params, timeout=30).json()["results"]
def harvest_raw(n=RAW_TARGET):
    """Pull pages until we've got at least n usable obs."""
    good = []
    page = 1
    while len(good) < n:
        for obs in inat_observations(page):
            taxon = obs.get("taxon") or {}
            if taxon.get("rank") != "species":  # skip higher-level IDs
                continue
            if not obs.get("photos"):           # need an image
                continue
            good.append(obs)
            if len(good) >= n: break
        page += 1
        time.sleep(0.2)  # be polite
    return good[:n]
raw_obs = harvest_raw()
print(f"Collected {len(raw_obs)} candidate observations")
Collected 500 candidate observations
# raw_obs[0]
# raw_obs[0]['taxon']
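Each observation comes back as a nested dict; here's a quick peek at the handful of fields the rest of this notebook relies on (field names per the iNat v1 API response):
o = raw_obs[0]
print(o['id'])                                 # observation id
print(o['taxon']['name'], o['taxon']['rank'])  # species name + rank
print(o['taxon']['ancestor_ids'][-3])          # family id (third-from-last ancestor)
print(o['photos'][0]['url'])                   # 'square' thumbnail URL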
def get_genera_in_family(family_id, exclude_id, k=DISTRACTORS):
    params = dict(rank='genus',
                  parent_id=family_id,
                  place_id=PLACE_ID,
                  per_page=50)
    genera = requests.get(INAT_TAXA_URL, params=params, timeout=30).json()["results"]
    pool = [gen for gen in genera if gen['id'] != exclude_id]
    picks = random.sample(pool, k=min(k, len(pool))) if pool else []
    return picks
def get_species_in_genus(genus_id, exclude_id, k=DISTRACTORS):
    params = dict(rank='species',
                  parent_id=genus_id,
                  place_id=PLACE_ID,
                  per_page=50)
    species = requests.get(INAT_TAXA_URL, params=params, timeout=30).json()["results"]
    pool = [sp for sp in species if sp['id'] != exclude_id]
    picks = random.sample(pool, k=min(k, len(pool))) if pool else []
    return [sp['name'] for sp in picks]
def get_species_in_family(family_id, exclude_id, k=DISTRACTORS):
    genera = get_genera_in_family(family_id, exclude_id)
    if not genera:
        return []  # return an empty list if no genera are found
    chosen_species = []
    for genus in random.sample(genera, min(len(genera), k)):
        species = get_species_in_genus(genus['id'], exclude_id, k=1)
        if species:
            chosen_species.append(species[0])
    return chosen_species
# quick test on the first observation
first = raw_obs[0]
fam_id = first['taxon']['ancestor_ids'][-3]
get_species_in_family(fam_id, first['taxon']['id'])
['Augusta glyphica',
'Gasteracantha falcicornis',
'Afracantha camerunensis',
'Gastroxya schoutedeni']
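One caveat: `ancestor_ids[-3]` relies on the list ending with [..., family, genus, the taxon itself], which holds for these observations but would grab the wrong taxon if iNat inserts an intermediate rank like subgenus or tribe. If you wanted to be defensive about it, here's a sketch of a rank-based lookup (this assumes the /v1/taxa/{id} endpoint's `ancestors` field, which labels each ancestor with its rank; `family_id_for` is a helper I'm adding here, not used elsewhere in the notebook):
def family_id_for(taxon_id):
    """Resolve the family id by rank rather than by list position."""
    res = requests.get(f"{INAT_TAXA_URL}/{taxon_id}", timeout=30).json()["results"][0]
    for anc in res.get("ancestors", []):
        if anc.get("rank") == "family":
            return anc["id"]
    return None  # no family-rank ancestor found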
def observation_to_row(obs):
    taxon = obs['taxon']
    correct = taxon['name']
    family_id = taxon['ancestor_ids'][-3]
    distractors = get_species_in_family(family_id, taxon['id'])
    options = distractors + [correct]
    random.shuffle(options)
    answer_idx = options.index(correct)

    # medium-sized iNat URLs are built by swapping 'square' for 'medium' in the template
    photo_url = obs['photos'][0]['url'].replace('square', 'medium')

    return dict(
        image = photo_url,
        options = options,
        answer = answer_idx,
        observation_id = obs['id']
    )
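The 'square'-to-'medium' swap works because iNat photo URLs end in a size name; in my experience the hosted sizes are square, small, medium, large, and original, so you can rewrite the URL to any of them:
url = raw_obs[0]['photos'][0]['url']
for size in ['small', 'medium', 'large', 'original']:
    print(url.replace('square', size))  # same photo at other sizes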
# prompt: Do this in a loop with tqdm and error handling, and only stop when FINAL_TARGET obs work
dataset_rows = []
with tqdm.tqdm(total=FINAL_TARGET) as pbar:
    for o in raw_obs:
        time.sleep(0.5)  # to be gentle on their API
        try:
            row = observation_to_row(o)
            # Filter out those without enough distractors
            if not row['options'] or len(row['options']) < 3:  # I'll allow some with fewer options
                continue
            dataset_rows.append(row)
            pbar.update(1)
            if len(dataset_rows) == FINAL_TARGET:
                break  # stop once FINAL_TARGET observations have worked
        except Exception as e:
            # print(f"Error processing observation: {e}")
            continue  # skip to the next observation if an error occurs
len(dataset_rows), dataset_rows[0]
100%|██████████| 100/100 [11:37<00:00, 6.97s/it]
(100,
{'image': 'https://inaturalist-open-data.s3.amazonaws.com/photos/281357140/medium.jpeg',
'options': ['Isoxya stuhlmanni',
'Macracantha arcuata',
'Actinacantha globulata',
'Gasteracantha milvoides',
'Parmatergus lens'],
'answer': 0,
'observation_id': 162709444})
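Before packaging, a quick sanity check that every row is well-formed: the answer index is in range and no option name is duplicated (a duplicate would make `options.index(correct)` ambiguous):
for row in dataset_rows:
    assert 0 <= row['answer'] < len(row['options'])         # answer index in range
    assert len(set(row['options'])) == len(row['options'])  # no duplicate names
print("all rows look well-formed")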
from datasets import Dataset, Features, Value, Sequence, ClassLabel, Image
features = Features({
    "observation_id": Value("int32"),
    "image": Image(),
    "options": Sequence(Value("string")),
    "answer": Value("int32"),
})
ds = Dataset.from_list(dataset_rows, features=features)
ds = ds.shuffle(seed=42)  # optional
ds[0]  # show one row
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=375x500>,
'options': ['Trioceros rudis',
'Nadzikambia mlanjensis',
'Rieppeleon kerstenii',
'Chamaeleo gracilis',
'Rhampholeon marshalli'],
'answer': 4,
'observation_id': 112489466}
from huggingface_hub import HfApi
from google.colab import userdata
= "johnowhitaker/specid_eval_jw"
REPO_ID = userdata.get('hf_token')
HF_TOKEN
# 1) create the repo once (skip if already exists)
=HF_TOKEN).create_repo(repo_id=REPO_ID, repo_type="dataset", private=False, exist_ok=True)
HfApi(token
# 2) push
=HF_TOKEN) ds.push_to_hub(REPO_ID, token
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
CommitInfo(commit_url='https://huggingface.co/datasets/johnowhitaker/specid_eval_jw/commit/a58934a1b93e113d7a6756632ded7c4bf953c645', commit_message='Upload dataset', commit_description='', oid='a58934a1b93e113d7a6756632ded7c4bf953c645', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/johnowhitaker/specid_eval_jw', endpoint='https://huggingface.co', repo_type='dataset', repo_id='johnowhitaker/specid_eval_jw'), pr_revision=None, pr_num=None)
from datasets import load_dataset
demo = load_dataset(REPO_ID, split="train")
print(demo[0]["options"], " -> correct idx:", demo[0]["answer"])
['Trioceros rudis', 'Nadzikambia mlanjensis', 'Rieppeleon kerstenii', 'Chamaeleo gracilis', 'Rhampholeon marshalli'] -> correct idx: 4
0]["image"] demo[