novelai-storage / Basedformer

Commit 91470a5a

Merge pull request #8 from NovelAI/datasetfixes

Authored Jul 08, 2022 by Eren Doğan; committed by GitHub on Jul 08, 2022.
Parents: 0c96e233, b1e335ec

Showing 3 changed files with 181 additions and 15 deletions:

    basedformer/dataset.py       +52  -15
    scripts/benchmarkdataset.py  +33   -0
    scripts/scrapedanbooru.py    +96   -0
basedformer/dataset.py
@@ -9,8 +9,12 @@ import simplejpeg
 import pickle
 from pathlib import Path
 from PIL import Image
 from tqdm import tqdm
 from concurrent.futures import as_completed
+import requests
+import hashlib
+import io
+import os

 # Does this work with other block_sizes? doesn't seem to.
 class FbDataset(data.Dataset):
@@ -73,6 +77,12 @@ class ShardedImageDataset(data.Dataset):
         with open(self.dataset_path, mode="r") as file_obj:
             self.mmap = mmap.mmap(file_obj.fileno(), length=0, access=mmap.ACCESS_READ)

+        #precompute pointer lookup dict for faster random read
+        self.pointer_lookup = {}
+        for t in self.index:
+            offset, length, id = t
+            self.pointer_lookup[id] = (offset, length)
+
         #make so metadata is shardable by world_size(num_gpus)
         #and batch_size
         self.index = self.index[:len(self.index) - (len(self.index) % (bsz * world_size))]
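The precomputed pointer_lookup trades a one-time pass over the index for O(1) random reads; previously read_from_id had to mask the whole index on every lookup. A minimal sketch of the difference, using a hypothetical three-record index:

    # hypothetical (offset, length, id) index, as built by ImageDatasetBuilder
    index = [(0, 4096, 17), (4096, 2048, 42), (6144, 1024, 99)]

    # O(n) per lookup: scan every triple until the id matches
    def find_linear(index, wanted_id):
        for offset, length, id in index:
            if id == wanted_id:
                return (offset, length)

    # O(1) per lookup: one dict build up front, then a single hash probe
    pointer_lookup = {id: (offset, length) for offset, length, id in index}
    assert pointer_lookup[42] == find_linear(index, 42) == (4096, 2048)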
@@ -115,25 +125,29 @@ class ShardedImageDataset(data.Dataset):
         return data, id

-    def read_from_id(self, id):
-        #to be used standalone
-        offset, size, _ = self.index[self.ids == id][0]
+    def read_from_id(self, id, decode=True):
+        offset, size = self.pointer_lookup[id]
         data = self.mmap[offset:offset+size]
-        data = decode_jpeg(data)
+        if decode:
+            data = decode_jpeg(data)
         return data

+    def get_metadata(self, id):
+        return self.metadata[id]
+
 class ImageDatasetBuilder():
-    def __init__(self, folder_path, name, dataset=True, index=True, metadata=False, threads=None):
+    def __init__(self, folder_path, name, dataset=True, index=True, metadata=False, threads=None, block_size=4096, align_fs_blocks=True):
         self.folder_path = Path(folder_path)
         self.dataset_name = name + ".ds"
         self.index_name = name + ".index"
         self.metadata_name = name + ".metadata"
+        self.index_name_temp = name + ".temp.index"
+        self.metadata_name_temp = name + ".temp.metadata"
         self.dataset_path = self.folder_path / self.dataset_name
         self.index_path = self.folder_path / self.index_name
         self.metadata_path = self.folder_path / self.metadata_name
+        self.index_path_temp = self.folder_path / self.index_name_temp
+        self.metadata_path_temp = self.folder_path / self.metadata_name_temp
         self.open_dataset = dataset
         self.open_index = index
         self.open_metadata = metadata
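The new decode flag on read_from_id lets a record come back as raw JPEG bytes instead of decoded pixels, so entries can be copied between archives without a decode/re-encode round trip; scripts/scrapedanbooru.py below uses exactly this path. A sketch, assuming old_dataset is an open ShardedImageDataset, new_dataset is a built ImageDatasetBuilder, and some_id is a record id present in both:

    # raw bytes straight from one archive into another: no JPEG decode/encode
    raw = old_dataset.read_from_id(some_id, decode=False)
    new_dataset.write(raw, some_id)

    # the default path still decodes to pixel data for training consumers
    pixels = old_dataset.read_from_id(some_id)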
@@ -141,6 +155,8 @@ class ImageDatasetBuilder():
         self.index = None
         self.metadata = None
         self.threads = threads
+        self.block_size = block_size
+        self.align_fs_blocks = align_fs_blocks

     @property
     def is_open(self):
@@ -181,8 +197,8 @@ class ImageDatasetBuilder():
         self.folder_path.mkdir(parents=True, exist_ok=True)
         if self.open_dataset:
-            dataset = open(self.dataset_path, mode="ab+")
-            dataset.flush()
+            self.dataset = open(self.dataset_path, mode="ab+")
+            self.dataset.flush()
         if self.open_index:
             self.index = []
@@ -217,12 +233,13 @@ class ImageDatasetBuilder():
         else:
             raise Exception("Metadata file not found at {}".format(self.metadata_path))

-    def operate(self, operation, batch, identities, metadata=None, executor=concurrent.futures.ThreadPoolExecutor, **kwargs):
+    def operate(self, operation, batch, identities, metadata=None, executor=concurrent.futures.ThreadPoolExecutor, use_tqdm=False, **kwargs):
         executor = executor(max_workers=self.threads)
-        futures = executor.map(operation, batch, **kwargs)
+        futures = executor.map(operation, batch)
+        if use_tqdm:
+            futures = tqdm(futures, total=len(batch), leave=False)
         futures = list(futures)
         for data, identity in zip(futures, identities):
             self.write(data, identity)
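The zip(futures, identities) pairing in operate is only correct because Executor.map yields results in the order of its inputs, not in completion order; wrapping the iterator in tqdm preserves that ordering. A self-contained check of that stdlib behavior:

    from concurrent.futures import ThreadPoolExecutor

    # Executor.map yields results in input order (not completion order),
    # so pairing outputs back up with an identity list via zip() is safe.
    with ThreadPoolExecutor(max_workers=4) as ex:
        results = list(ex.map(lambda x: x * x, [3, 1, 2]))
    assert results == [9, 1, 4]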
@@ -233,7 +250,8 @@ class ImageDatasetBuilder():
         except:
             return None
         else:
-            data = Image.open(io.BytesIO(image_data))
+            data = Image.open(io.BytesIO(data))
             data = np.asarray(data)
             data = simplejpeg.encode_jpeg(data, quality=91)
             return data
@@ -261,8 +279,17 @@ class ImageDatasetBuilder():
         if data == None:
             return

+        data_ptr = self.dataset.tell()
+        data_len = len(data)
+        self.index.append([data_ptr, data_len, identity])
         self.dataset.write(data)
-        self.index.append([self.dataset.tell(), len(data), identity])
+
+        # block align
+        if self.align_fs_blocks:
+            remainder = (data_ptr + data_len) % self.block_size
+            if remainder != 0:
+                self.dataset.write(bytearray(self.block_size - remainder))

         if self.metadata and metadata:
             self.metadata[identity] = metadata
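Recording the write offset before the write (instead of after, as the removed line did) makes the index point at the start of each record, and the optional padding rounds every record up to the next block_size boundary so a random read touches only the filesystem blocks belonging to that record. The padding arithmetic, with hypothetical numbers:

    block_size = 4096
    data_ptr, data_len = 8192, 6000   # record starts block-aligned, ends mid-block
    remainder = (data_ptr + data_len) % block_size   # 14192 % 4096 == 1904
    padding = block_size - remainder                 # 2192 zero bytes appended
    # the next record then begins exactly on a block boundary
    assert (data_ptr + data_len + padding) % block_size == 0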
@@ -277,17 +304,27 @@ class ImageDatasetBuilder():
             print("Warning: Index not built, couldn't flush")
             return

-        with open(self.index_path, 'wb') as f:
+        with open(self.index_path_temp, 'wb') as f:
             pickle.dump(self.index, f)
+        try:
+            os.remove(self.index_path)
+        except:
+            pass
+        os.rename(self.index_path_temp, self.index_path)

     def flush_metadata(self, silent=False):
         if not self.metadata and not silent:
             print("Warning: Metadata not built, couldn't flush")
             return

-        with open(self.metadata_path, 'wb') as f:
+        with open(self.metadata_path_temp, 'wb') as f:
             pickle.dump(self.metadata, f)
+        try:
+            os.remove(self.metadata_path)
+        except:
+            pass
+        os.rename(self.metadata_path_temp, self.metadata_path)

     def flush(self, silent=False):
         if not self.dataset and not silent:
             print("Warning: Dataset not built, couldn't flush")
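Flushing to a temp file and renaming it over the old one means a crash mid-pickle can no longer truncate the only copy of the index or metadata; on POSIX, rename() within a single filesystem replaces the target atomically, so readers see either the old file or the new one, never a partial write. A compact sketch of the same pattern, assuming Python 3.3+ (os.replace overwrites its target atomically, making the separate os.remove unnecessary):

    import os
    import pickle

    def atomic_pickle(obj, path):
        tmp = str(path) + ".tmp"
        with open(tmp, "wb") as f:
            pickle.dump(obj, f)   # a crash here leaves only the .tmp file behind
        os.replace(tmp, path)     # atomic swap: old or new, never partial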
scripts/benchmarkdataset.py (new file, mode 100644)
from basedformer.dataset import ShardedImageDataset
import time
import random

dataset_folder = "/home/xuser/nvme1/dataset/danbooru_updated_page/"
dataset = ShardedImageDataset(dataset_folder + "danbooru_updated.ds", dataset_folder + "danbooru_updated.index", None, bsz=1)

#read through first to normalize times / cache
total_size = 0
for key in dataset.pointer_lookup.keys():
    offset, size = dataset.pointer_lookup[key]
    data = list(dataset.mmap[offset:offset + size]) # convert to list to ensure data is actually read
    total_size = total_size + size

#benchmark sequential read
start_time = time.time()
for key in dataset.pointer_lookup.keys():
    offset, size = dataset.pointer_lookup[key]
    data = list(dataset.mmap[offset:offset + size]) # convert to list to ensure data is actually read
end_time = time.time()
print("Sequential read took " + str(end_time - start_time) + " seconds to read " + str(total_size // 1024 // 1024) + "MiB")

#benchmark random read
keys = list(dataset.pointer_lookup.keys())
random.shuffle(keys)
start_time = time.time()
for key in keys:
    offset, size = dataset.pointer_lookup[key]
    data = list(dataset.mmap[offset:offset + size]) # convert to list to ensure data is actually read
end_time = time.time()
print("Random read took " + str(end_time - start_time) + " seconds to read " + str(total_size // 1024 // 1024) + "MiB")
\ No newline at end of file
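Because the warm-up pass already slices every record, both timed passes largely measure page-cache rather than raw disk throughput, which keeps the sequential-versus-random comparison apples-to-apples. A hypothetical helper for turning the script's totals into a rate:

    def mib_per_sec(total_bytes, seconds):
        # hypothetical helper: express the measured totals as throughput
        return (total_bytes / (1024 * 1024)) / seconds

    print("Sequential: %.1f MiB/s" % mib_per_sec(total_size, end_time - start_time))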
scripts/scrapedanbooru.py (new file, mode 100644)
# todo: arg parsing maybe
metadata_folder = "../danbooru_metadata/danbooru_metadata_new/"
existing_dataset = "/home/xuser/nvme1/dataset/danbooru/"
new_dataset_folder = "/home/xuser/nvme1/dataset/danbooru_updated/"

from tqdm import tqdm
import os
import json
from codecs import decode
from basedformer.dataset import ShardedImageDataset, ImageDatasetBuilder

all_metadata = {}
for f in tqdm(os.listdir(metadata_folder)):
    try:
        metadata = json.loads(decode(open(metadata_folder + f, 'rb').read(), 'zlib').decode("UTF-8"))
        for post in metadata:
            if "id" in post:
                all_metadata[post["id"]] = post
    except:
        print("Error parsing " + f)

print("Loading old dataset...")
old_dataset = ShardedImageDataset(existing_dataset + "danbooru.ds", existing_dataset + "danbooru.index", existing_dataset + "danbooru.metadata")
old_meta = old_dataset.metadata
old_lookup = old_dataset.pointer_lookup

print("Calculating entries that can be copied from existing db...")
bad_types = ['jpg', 'jpeg']
can_keep = {}
can_keep_list = []
for k in all_metadata.keys():
    data = all_metadata[k]
    if data['file_ext'] not in bad_types:
        if data['id'] in old_meta and data['id'] in old_lookup:
            #can_keep.append(data['id'])
            can_keep[data['id']] = True
            can_keep_list.append(data['id'])
print("can keep %d of %d with new db of %d" % (len(can_keep), len(old_meta), len(all_metadata)))

#estimate new dataset size
total_new_bytes = (os.path.getsize(existing_dataset + "danbooru.ds") * (len(can_keep) / len(old_meta)))
total_new_jpegs = 0
for k in all_metadata.keys():
    if k not in can_keep:
        data = all_metadata[k]
        if data['file_ext'] in bad_types:
            total_new_bytes = total_new_bytes + data['file_size']
            total_new_jpegs = total_new_jpegs + 1
total_new_bytes = total_new_bytes + int((os.path.getsize(existing_dataset + "danbooru.ds") / len(old_meta)) * (len(all_metadata) - (len(can_keep) + total_new_jpegs)))
print("Estimated new dataset size: %dGiB" % (((total_new_bytes / 1024) / 1024) / 1024))

print("Copying old db data...")
# detect block size of fs the archive is stored on
block_size = int(os.popen("getconf PAGE_SIZE").read().lstrip().rstrip())
#int(os.popen("stat -fc %s " + new_dataset_folder).read().lstrip().rstrip())
new_dataset = ImageDatasetBuilder(folder_path=new_dataset_folder, name="danbooru_updated", threads=32, block_size=block_size, align_fs_blocks=True)
new_dataset.build()

# how many operations to run at once
copy_chunk_size = 4096
for e in tqdm(range(0, len(can_keep_list), copy_chunk_size)):
    chunk = can_keep_list[e:e + copy_chunk_size]
    new_dataset.operate(lambda id: old_dataset.read_from_id(id, decode=False), chunk, chunk, use_tqdm=True)
new_dataset.flush()
new_dataset.flush_index()
new_dataset.flush_metadata()

print("Scraping...")
to_scrape = []
for k in all_metadata.keys():
    if k not in can_keep:
        to_scrape.append(k)

def download_danbooru(id):
    meta = all_metadata[id]
    return new_dataset.url_op(meta["large_file_url"] if meta["has_large"] else meta["file_url"], meta["md5"])

save_every = 25
for e in tqdm(range(0, len(to_scrape), copy_chunk_size)):
    chunk = to_scrape[e:e + copy_chunk_size]
    new_dataset.operate(download_danbooru, chunk, chunk, use_tqdm=True)
    if (e // copy_chunk_size) % save_every == 0:
        new_dataset.flush()
        new_dataset.flush_index()
        new_dataset.flush_metadata()
new_dataset.flush()
new_dataset.flush_index()
new_dataset.flush_metadata()
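A caveat on the block-size detection above: getconf PAGE_SIZE reports the virtual-memory page size (typically 4096 on Linux), which usually coincides with, but is not guaranteed to equal, the filesystem block size that the commented-out stat -fc %s call would report. A sketch of reading the latter directly, without a subprocess:

    import os

    # f_bsize is the filesystem's own block size, matching `stat -fc %s`
    block_size = int(os.statvfs(new_dataset_folder).f_bsize)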