Merge remote-tracking branch 'google/upstream-f2fs-stable-linux-4.14.y' into sheesh
* google/upstream-f2fs-stable-linux-4.14.y: f2fs: fix to do sanity check on .cp_pack_total_block_count f2fs: make gc_urgent and gc_segment_mode sysfs node readable f2fs: use aggressive GC policy during f2fs_disable_checkpoint() f2fs: fix compressed file start atomic write may cause data corruption f2fs: initialize sbi->gc_mode explicitly f2fs: introduce gc_urgent_mid mode f2fs: compress: fix to print raw data size in error path of lz4 decompression f2fs: remove redundant parameter judgment f2fs: use spin_lock to avoid hang f2fs: don't get FREEZE lock in f2fs_evict_inode in frozen fs f2fs: remove unnecessary read for F2FS_FITS_IN_INODE f2fs: fix to do sanity check on curseg->alloc_type f2fs: fix to avoid potential deadlock f2fs: quota: fix loop condition at f2fs_quota_sync() f2fs: Restore rwsem lockdep support f2fs: fix missing free nid in f2fs_handle_failed_inode f2fs: add a way to limit roll forward recovery time f2fs: introduce F2FS_IPU_HONOR_OPU_WRITE ipu policy f2fs: adjust readahead block number during recovery f2fs: fix to unlock page correctly in error path of is_alive() f2fs: expose discard related parameters in sysfs f2fs: move discard parameters into discard_cmd_control f2fs: fix to enable ATGC correctly via gc_idle sysfs interface f2fs: move f2fs to use reader-unfair rwsems f2fs: do not allow partial truncation on pinned file f2fs: remove redunant invalidate compress pages f2fs: Simplify bool conversion f2fs: don't drop compressed page cache in .{invalidate,release}page f2fs: fix to reserve space for IO align feature f2fs: fix to check available space of CP area correctly in update_ckpt_flags() f2fs: support fault injection to f2fs_trylock_op() f2fs: clean up __find_inline_xattr() with __find_xattr() f2fs: fix to do sanity check on last xattr entry in __f2fs_setxattr() f2fs: do not bother checkpoint by f2fs_get_node_info f2fs: avoid down_write on nat_tree_lock during checkpoint f2fs: compress: fix potential deadlock of compress file f2fs: avoid EINVAL by SBI_NEED_FSCK when pinning a file f2fs: add gc_urgent_high_remaining sysfs node f2fs: fix to do sanity check in is_alive() f2fs: fix to avoid panic in is_alive() if metadata is inconsistent f2fs: fix to do sanity check on inode type during garbage collection f2fs: avoid duplicate call of mark_inode_dirty f2fs: fix remove page failed in invalidate compress pages f2fs: fix the f2fs_file_write_iter tracepoint f2fs: do not expose unwritten blocks to user by DIO f2fs: reduce indentation in f2fs_file_write_iter() f2fs: rework write preallocations f2fs: compress: reduce one page array alloc and free when write compressed page f2fs: show number of pending discard commands f2fs: check nr_pages for readahead f2fs: fix UAF in f2fs_available_free_memory f2fs: invalidate META_MAPPING before IPU/DIO write f2fs: support fault injection for dquot_initialize() f2fs: fix incorrect return value in f2fs_sanity_check_ckpt() f2fs: compress: disallow disabling compress on non-empty compressed file f2fs: compress: fix overwrite may reduce compress ratio unproperly f2fs: multidevice: support direct IO f2fs: introduce fragment allocation mode mount option f2fs: include non-compressed blocks in compr_written_block f2fs: fix wrong condition to trigger background checkpoint correctly f2fs: fix to use WHINT_MODE f2fs: fix up f2fs_lookup tracepoints f2fs: set SBI_NEED_FSCK flag when inconsistent node block found f2fs: introduce excess_dirty_threshold() f2fs: avoid attaching SB_ACTIVE flag during mount f2fs: quota: fix potential deadlock f2fs: should use 
GFP_NOFS for directory inodes f2fs: should put a page beyond EOF when preparing a write f2fs: deallocate compressed pages when error happens f2fs: enable realtime discard iff device supports discard f2fs: guarantee to write dirty data when enabling checkpoint back f2fs: fix to unmap pages from userspace process in punch_hole() f2fs: fix unexpected ENOENT comes from f2fs_map_blocks() f2fs: fix to account missing .skipped_gc_rwsem f2fs: adjust unlock order for cleanup f2fs: Don't create discard thread when device doesn't support realtime discard f2fs: rebuild nat_bits during umount f2fs: introduce periodic iostat io latency traces f2fs: separate out iostat feature f2fs: compress: do sanity check on cluster f2fs: fix description about main_blkaddr node f2fs: convert S_IRUGO to 0444 f2fs: fix to keep compatibility of fault injection interface f2fs: support fault injection for f2fs_kmem_cache_alloc() f2fs: compress: allow write compress released file after truncate to zero f2fs: correct comment in segment.h f2fs: improve sbi status info in debugfs/f2fs/status f2fs: compress: avoid duplicate counting of valid blocks when read compressed file f2fs: fix to do sanity check for sb/cp fields correctly f2fs: avoid unneeded memory allocation in __add_ino_entry() f2fs: extent cache: support unaligned extent f2fs: Kconfig: clean up config options about compression f2fs: reduce the scope of setting fsck tag when de->name_len is zero f2fs: fix to stop filesystem update once CP failed f2fs: introduce discard_unit mount option f2fs: fix min_seq_blocks can not make sense in some scenes. f2fs: fix to force keeping write barrier for strict fsync mode f2fs: fix wrong checkpoint_changed value in f2fs_remount() f2fs: show sbi status in debugfs/f2fs/status f2fs: turn back remapped address in compressed page endio f2fs: change fiemap way in printing compression chunk f2fs: do not submit NEW_ADDR to read node block f2fs: compress: remove unneeded read when rewrite whole cluster f2fs: don't sleep while grabing nat_tree_lock f2fs: remove allow_outplace_dio() f2fs: make f2fs_write_failed() take struct inode f2fs: quota: fix potential deadlock f2fs: let's keep writing IOs on SBI_NEED_FSCK f2fs: Revert "f2fs: Fix indefinite loop in f2fs_gc() v1" f2fs: avoid to create an empty string as the extension_list f2fs: compress: fix to set zstd compress level correctly f2fs: add sysfs nodes to get GC info for each GC mode f2fs: drop dirty node pages when cp is in error status f2fs: initialize page->private when using for our internal use f2fs: compress: add nocompress extensions support Revert "f2fs: avoid attaching SB_ACTIVE flag during mount/remount" f2fs: remove false alarm on iget failure during GC f2fs: enable extent cache for compression files in read-only f2fs: fix to avoid adding tab before doc section f2fs: introduce f2fs_casefolded_name slab cache f2fs: swap: support migrating swapfile in aligned write mode f2fs: swap: remove dead codes f2fs: compress: add compress_inode to cache compressed blocks f2fs: clean up /sys/fs/f2fs/<disk>/features f2fs: add pin_file in feature list f2fs: Advertise encrypted casefolding in sysfs f2fs: Show casefolding support only when supported f2fs: support RO feature f2fs: logging neatening f2fs: restructure f2fs page.private layout f2fs: introduce FI_COMPRESS_RELEASED instead of using IMMUTABLE bit f2fs: compress: remove unneeded preallocation f2fs: avoid attaching SB_ACTIVE flag during mount/remount f2fs: atgc: export entries for better tunability via sysfs f2fs: compress: fix to disallow 
temp extension f2fs: let's allow compression for mmap files f2fs: add MODULE_SOFTDEP to ensure crc32 is included in the initramfs f2fs: return success if there is no work to do f2fs: compress: clean up parameter of __f2fs_cluster_blocks() f2fs: compress: remove unneeded f2fs_put_dnode() f2fs: atgc: fix to set default age threshold f2fs: Prevent swap file in LFS mode f2fs: fix to avoid racing on fsync_entry_slab by multi filesystem instances f2fs: add cp_error check in f2fs_write_compressed_pages f2fs: compress: rename __cluster_may_compress f2fs: return EINVAL for hole cases in swap file f2fs: avoid swapon failure by giving a warning first f2fs: compress: fix to assign cc.cluster_idx correctly f2fs: compress: fix race condition of overwrite vs truncate f2fs: compress: fix to free compress page correctly f2fs: support iflag change given the mask f2fs: avoid null pointer access when handling IPU error f2fs: drop inplace IO if fs status is abnormal f2fs: compress: remove unneed check condition f2fs: clean up left deprecated IO trace codes f2fs: avoid using native allocate_segment_by_default() f2fs: remove unnecessary struct declaration f2fs: fix to avoid NULL pointer dereference f2fs: avoid duplicated codes for cleanup f2fs: document: add description about compressed space handling f2fs: clean up build warnings f2fs: fix the periodic wakeups of discard thread f2fs: fix to avoid accessing invalid fio in f2fs_allocate_data_block() f2fs: fix to avoid GC/mmap race with f2fs_truncate() f2fs: set checkpoint_merge by default f2fs: Fix a hungtask problem in atomic write f2fs: fix to restrict mount condition on readonly block device f2fs: introduce gc_merge mount option f2fs: fix to cover __allocate_new_section() with curseg_lock f2fs: fix wrong alloc_type in f2fs_do_replace_block f2fs: delete empty compress.h f2fs: fix a typo in inode.c f2fs: allow to change discard policy based on cached discard cmds f2fs: fix to avoid touching checkpointed data in get_victim() f2fs: fix to update last i_size if fallocate partially succeeds f2fs: fix error path of f2fs_remount() f2fs: fix wrong comment of nat_tree_lock f2fs: fix to avoid out-of-bounds memory access f2fs: don't start checkpoint thread in readonly mountpoint f2fs: do not use AT_SSR mode in FG_GC & high urgent BG_GC f2fs: add sysfs nodes to get runtime compression stat f2fs: fix to use per-inode maxbytes in f2fs_fiemap f2fs: fix to align to section for fallocate() on pinned file f2fs: expose # of overprivision segments f2fs: fix error handling in f2fs_end_enable_verity() f2fs: fix a redundant call to f2fs_balance_fs if an error occurs f2fs: remove unused file_clear_encrypt() f2fs: check if swapfile is section-alligned f2fs: fix last_lblock check in check_swap_activate_fast f2fs: remove unnecessary IS_SWAPFILE check f2fs: Replace one-element array with flexible-array member f2fs: compress: Allow modular (de)compression algorithms f2fs: check discard command number before traversing discard pending list f2fs: update comments for explicit memory barrier f2fs: remove unused FORCE_FG_GC macro f2fs: avoid unused f2fs_show_compress_options() f2fs: fix panic during f2fs_resize_fs() f2fs: fix to allow migrating fully valid segment f2fs: fix a spelling error f2fs: fix a spacing coding style fs: Enable bmap() function to properly return errors f2fs: remove obsolete f2fs.txt fs-verity: support reading signature with ioctl fs-verity: support reading descriptor with ioctl fs-verity: support reading Merkle tree with ioctl fs-verity: add FS_IOC_READ_VERITY_METADATA 
ioctl fs-verity: don't pass whole descriptor to fsverity_verify_signature() fs-verity: factor out fsverity_get_descriptor() fs-verity: move structs needed for file signing to UAPI header fs-verity: rename "file measurement" to "file digest" fs-verity: rename fsverity_signed_digest to fsverity_formatted_digest fs-verity: remove filenames from file comments fs-verity: use smp_load_acquire() for ->i_verity_info f2fs: remove FAULT_ALLOC_BIO f2fs: use blkdev_issue_flush in __submit_flush_wait f2fs: remove a few bd_part checks quota: Cleanup list iteration in dqcache_shrink_scan() quota: reclaim least recently used dquots fs: quota: Replace GFP_ATOMIC with GFP_KERNEL in dquot_init quota: Check for register_shrinker() failure. quota: propagate error from __dquot_initialize quota: be aware of error from dquot_initialize Documentation: f2fs: fix typo s/automaic/automatic f2fs: give a warning only for readonly partition f2fs: don't grab superblock freeze for flush/ckpt thread f2fs: add ckpt_thread_ioprio sysfs node f2fs: introduce checkpoint_merge mount option f2fs: relocate inline conversion from mmap() to mkwrite() f2fs: fix a wrong condition in __submit_bio f2fs: remove unnecessary initialization in xattr.c f2fs: fix to avoid inconsistent quota data f2fs: flush data when enabling checkpoint back f2fs: deprecate f2fs_trace_io f2fs: remove unused stat_{inc, dec}_atomic_write f2fs: introduce sb_status sysfs node f2fs: fix to use per-inode maxbytes f2fs: compress: fix potential deadlock libfs: unexport generic_ci_d_compare() and generic_ci_d_hash() f2fs: fix to set/clear I_LINKABLE under i_lock f2fs: fix null page reference in redirty_blocks f2fs: clean up post-read processing f2fs: trival cleanup in move_data_block() f2fs: fix out-of-repair __setattr_copy() f2fs: fix to tag FIEMAP_EXTENT_MERGED in f2fs_fiemap() f2fs: introduce a new per-sb directory in sysfs f2fs: compress: support compress level f2fs: compress: deny setting unsupported compress algorithm f2fs: relocate f2fs_precache_extents() f2fs: enforce the immutable flag on open files f2fs: enhance to update i_mode and acl atomically in f2fs_setattr() f2fs: fix to set inode->i_mode correctly for posix_acl_update_mode f2fs: Replace expression with offsetof() f2fs: handle unallocated section and zone on pinned/atgc f2fs: compress: fix compression chksum f2fs: fix shift-out-of-bounds in sanity_check_raw_super() f2fs: fix race of pending_pages in decompression f2fs: fix to account inline xattr correctly during recovery f2fs: inline: fix wrong inline inode stat f2fs: inline: correct comment in f2fs_recover_inline_data f2fs: don't check PAGE_SIZE again in sanity_check_raw_super() f2fs: convert to F2FS_*_INO macro f2fs: introduce max_io_bytes, a sysfs entry, to limit bio size f2fs: don't allow any writes on readonly mount f2fs: avoid race condition for shrinker count f2fs: add F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE f2fs: add compress_mode mount option f2fs: Remove unnecessary unlikely() f2fs: init dirty_secmap incorrectly f2fs: remove buffer_head which has 32bits limit f2fs: fix wrong block count instead of bytes f2fs: use new conversion functions between blks and bytes f2fs: rename logical_to_blk and blk_to_logical f2fs: fix kbytes written stat for multi-device case f2fs: compress: support chksum f2fs: fix to avoid REQ_TIME and CP_TIME collision f2fs: change to use rwsem for cp_mutex f2fs: Handle casefolding with Encryption fscrypt: Have filesystems handle their d_ops libfs: Add generic function for setting dentry_ops f2fs: Remove the 
redundancy initialization f2fs: remove writeback_inodes_sb in f2fs_remount f2fs: fix double free of unicode map f2fs: fix compat F2FS_IOC_{MOVE,GARBAGE_COLLECT}_RANGE f2fs: avoid unneeded data copy in f2fs_ioc_move_range() f2fs: add F2FS_IOC_SET_COMPRESS_OPTION ioctl f2fs: add F2FS_IOC_GET_COMPRESS_OPTION ioctl f2fs: move ioctl interface definitions to separated file f2fs: fix to seek incorrect data offset in inline data file f2fs: check fiemap parameters f2fs: call f2fs_get_meta_page_retry for nat page fscrypt: rename DCACHE_ENCRYPTED_NAME to DCACHE_NOKEY_NAME fscrypt: don't call no-key names "ciphertext names" fscrypt: export fscrypt_d_revalidate() f2fs: code cleanup by removing unnecessary check f2fs: wait for sysfs kobject removal before freeing f2fs_sb_info f2fs: fix writecount false positive in releasing compress blocks f2fs: introduce check_swap_activate_fast() f2fs: don't issue flush in f2fs_flush_device_cache() for nobarrier case f2fs: handle errors of f2fs_get_meta_page_nofail f2fs: fix to set SBI_NEED_FSCK flag for inconsistent inode f2fs: reject CASEFOLD inode flag without casefold feature f2fs: fix memory alignment to support 32bit f2fs: fix slab leak of rpages pointer f2fs: compress: fix to disallow enabling compress on non-empty file f2fs: compress: introduce cic/dic slab cache f2fs: compress: introduce page array slab cache f2fs: fix to do sanity check on segment/section count f2fs: fix to check segment boundary during SIT page readahead f2fs: fix uninit-value in f2fs_lookup fs/buffer.c: record blockdev write errors in super_block that it backs vfs: track per-sb writeback errors and report them to syncfs f2fs: remove unneeded parameter in find_in_block() f2fs: fix wrong total_sections check and fsmeta check f2fs: remove duplicated code in sanity_check_area_boundary f2fs: remove unused check on version_bitmap f2fs: relocate blkzoned feature check f2fs: do sanity check on zoned block device path f2fs: add trace exit in exception path f2fs: change return value of reserved_segments to unsigned int f2fs: clean up kvfree f2fs: change virtual mapping way for compression pages f2fs: change return value of f2fs_disable_compressed_file to bool f2fs: change i_compr_blocks of inode to atomic value f2fs: ignore compress mount option on image w/o compression feature f2fs: allocate proper size memory for zstd decompress f2fs: change compr_blocks of superblock info to 64bit f2fs: add block address limit check to compressed file f2fs: check position in move range ioctl f2fs: correct statistic of APP_DIRECT_IO/APP_DIRECT_READ_IO f2fs: support age threshold based garbage collection f2fs: Use generic casefolding support fs: Add standard casefolding support unicode: Add utf8_casefold_hash f2fs: compress: use more readable atomic_t type for {cic,dic}.ref f2fs: fix compile warning f2fs: support 64-bits key in f2fs rb-tree node entry f2fs: inherit mtime of original block during GC f2fs: record average update time of segment f2fs: introduce inmem curseg f2fs: compress: remove unneeded code f2fs: remove duplicated type casting f2fs: support zone capacity less than zone size f2fs: update changes in upstream on GC_URGENT_HIGH f2fs: Return EOF on unaligned end of file DIO read f2fs: fix indefinite loop scanning for free nid f2fs: Fix type of section block count variables f2fs: prepare a waiter before entering io_schedule f2fs: update_sit_entry: Make the judgment condition of f2fs_bug_on more intuitive f2fs: replace test_and_set/clear_bit() with set/clear_bit() f2fs: make file immutable even if 
releasing zero compression block f2fs: compress: disable compression mount option if compression is off f2fs: compress: add sanity check during compressed cluster read f2fs: use macro instead of f2fs verity version f2fs: fix deadlock between quota writes and checkpoint f2fs: correct comment of f2fs_exist_written_data f2fs: compress: delay temp page allocation f2fs: compress: fix to update isize when overwriting compressed file f2fs: space related cleanup f2fs: fix use-after-free issue f2fs: Change the type of f2fs_flush_inline_data() to void f2fs: add F2FS_IOC_SEC_TRIM_FILE ioctl f2fs: segment.h: delete a duplicated word f2fs: compress: fix to avoid memory leak on cc->cpages f2fs: use generic names for generic ioctls f2fs: don't keep meta inode pages used for compressed block migration f2fs: fix error path in do_recover_data() f2fs: fix to wait GCed compressed page writeback f2fs: remove write attribute of main_blkaddr sysfs node f2fs: add GC_URGENT_LOW mode in gc_urgent f2fs: avoid readahead race condition f2fs: fix return value of move_data_block() f2fs: add parameter op_flag in f2fs_submit_page_read() f2fs: split f2fs_allocate_new_segments() f2fs: lost matching-pair of trace in f2fs_truncate_inode_blocks f2fs: fix an oops in f2fs_is_compressed_page f2fs: make trace enter and end in pairs for unlink f2fs: fix to check page dirty status before writeback f2fs: remove the unused compr parameter f2fs: support to trace f2fs_fiemap() f2fs: support to trace f2fs_bmap() f2fs: fix wrong return value of f2fs_bmap_compress() f2fs: remove useless parameter of __insert_free_nid() f2fs: fix typo in comment of f2fs_do_add_link f2fs: fix to wait page writeback before update f2fs: show more debug info for per-temperature log f2fs: add f2fs_gc exception handle in f2fs_ioc_gc_range f2fs: clean up parameter of f2fs_allocate_data_block() f2fs: shrink node_write lock coverage f2fs: add prefix for exported symbols f2fs: use kfree() to free variables allocated by match_strdup() f2fs: get the right gc victim section when section has several segments f2fs: fix a race condition between f2fs_write_end_io and f2fs_del_fsync_node_entry f2fs: remove useless truncate in f2fs_collapse_range() f2fs: use kfree() instead of kvfree() to free superblock data f2fs: avoid checkpatch error f2fs: should avoid inode eviction in synchronous path
commit e9e83ae080
Documentation/ABI/testing/sysfs-fs-f2fs

@@ -22,7 +22,8 @@ Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
Description: Controls the victim selection policy for garbage collection.
Setting gc_idle = 0(default) will disable this option. Setting
gc_idle = 1 will select the Cost Benefit approach & setting
gc_idle = 2 will select the greedy approach.
gc_idle = 2 will select the greedy approach & setting
gc_idle = 3 will select the age-threshold based approach.

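For illustration, the policy can be switched at runtime through this node; a minimal sketch, assuming the device name is sdb (an example only)::

    cat /sys/fs/f2fs/sdb/gc_idle           # current policy, 0 = disabled
    echo 1 > /sys/fs/f2fs/sdb/gc_idle      # cost-benefit victim selection
    echo 3 > /sys/fs/f2fs/sdb/gc_idle      # age-threshold based victim selection
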
What: /sys/fs/f2fs/<disk>/reclaim_segments
Date: October 2013
@@ -37,18 +38,25 @@ Description: This parameter controls the number of prefree segments to be
What: /sys/fs/f2fs/<disk>/main_blkaddr
Date: November 2019
Contact: "Ramon Pantin" <pantin@google.com>
Description:
Shows first block address of MAIN area.
Description: Shows first block address of MAIN area.

What: /sys/fs/f2fs/<disk>/ipu_policy
Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Controls the in-place-update policy.
updates in f2fs. User can set:
0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR,
0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL,
0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC,
0x40: F2FS_IPU_NOCACHE.

==== =================
0x01 F2FS_IPU_FORCE
0x02 F2FS_IPU_SSR
0x04 F2FS_IPU_UTIL
0x08 F2FS_IPU_SSR_UTIL
0x10 F2FS_IPU_FSYNC
0x20 F2FS_IPU_ASYNC
0x40 F2FS_IPU_NOCACHE
0x80 F2FS_IPU_HONOR_OPU_WRITE
==== =================

Refer segment.h for details.

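As a sketch, the policies above are bit flags and can be combined by writing the OR of their values; the device name sdb is only an example, and 6 here is 0x02 | 0x04::

    echo 6 > /sys/fs/f2fs/sdb/ipu_policy    # F2FS_IPU_SSR | F2FS_IPU_UTIL
    cat /sys/fs/f2fs/sdb/ipu_policy
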
What: /sys/fs/f2fs/<disk>/min_ipu_util
@@ -88,6 +96,33 @@ Description: Controls the issue rate of discard commands that consist of small
checkpoint is triggered, and issued during the checkpoint.
By default, it is disabled with 0.

What: /sys/fs/f2fs/<disk>/max_discard_request
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the number of discards a thread will issue at a time.
Higher number will allow the discard thread to finish its work
faster, at the cost of higher latency for incoming I/O.

What: /sys/fs/f2fs/<disk>/min_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait between
issuing discard requests when there are discards to be issued and
no I/O aware interruptions occur.

What: /sys/fs/f2fs/<disk>/mid_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait between
issuing discard requests when there are discards to be issued and
an I/O aware interruption occurs.

What: /sys/fs/f2fs/<disk>/max_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait when there are
no discard operations to be issued.

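For illustration, these knobs can be read and tuned at runtime. A sketch only: the device name sdb and the values are examples, and it is an assumption here that the issue-time nodes take milliseconds::

    cat /sys/fs/f2fs/sdb/max_discard_request
    echo 16 > /sys/fs/f2fs/sdb/max_discard_request      # more discards per round
    echo 50 > /sys/fs/f2fs/sdb/min_discard_issue_time   # shorter wait between rounds
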
What: /sys/fs/f2fs/<disk>/discard_granularity
Date: July 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
@@ -102,6 +137,11 @@ Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Set timeout to issue discard commands during umount.
Default: 5 secs

What: /sys/fs/f2fs/<disk>/pending_discard
Date: November 2021
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of pending discard commands in the queue.

What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@@ -192,7 +232,34 @@ Description: Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/feature
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows all enabled features in current device.
Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/
Shows all enabled features in current device.
Supported features:
encryption, blkzoned, extra_attr, projquota, inode_checksum,
flexible_inline_xattr, quota_ino, inode_crtime, lost_found,
verity, sb_checksum, casefold, readonly, compression, pin_file.

What: /sys/fs/f2fs/<disk>/feature_list/
Date: June 2021
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Expand /sys/fs/f2fs/<disk>/features to meet sysfs rule.
Supported on-disk features:
encryption, block_zoned (aka blkzoned), extra_attr,
project_quota (aka projquota), inode_checksum,
flexible_inline_xattr, quota_ino, inode_crtime, lost_found,
verity, sb_checksum, casefold, readonly, compression.
Note that, pin_file is moved into /sys/fs/f2fs/features/.

What: /sys/fs/f2fs/features/
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows all enabled kernel features.
Supported features:
encryption, block_zoned, extra_attr, project_quota,
inode_checksum, flexible_inline_xattr, quota_ino,
inode_crtime, lost_found, verity, sb_checksum,
casefold, readonly, compression, test_dummy_encryption_v2,
atomic_write, pin_file, encrypted_casefold.

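For illustration, the supported features can be inspected from userspace; a sketch, with sdb standing in for the real device name::

    ls /sys/fs/f2fs/features/             # features supported by the running kernel
    ls /sys/fs/f2fs/sdb/feature_list/     # per-filesystem feature attributes
    cat /sys/fs/f2fs/sdb/feature_list/encryption
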
What: /sys/fs/f2fs/<disk>/inject_rate
Date: May 2016
@@ -227,9 +294,16 @@ Description: Shows current reserved blocks in system, it may be temporarily
What: /sys/fs/f2fs/<disk>/gc_urgent
Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Do background GC agressively when set. When gc_urgent = 1,
background thread starts to do GC by given gc_urgent_sleep_time
interval. It is set to 0 by default.
Description: Do background GC aggressively when set. Set to 0 by default.
gc urgent high(1): does GC forcibly in a period of given
gc_urgent_sleep_time and ignores I/O idling check. uses greedy
GC approach and turns SSR mode on.
gc urgent low(2): lowers the bar of checking I/O idling in
order to process outstanding discard commands and GC a
little bit aggressively. uses cost benefit GC approach.
gc urgent mid(3): does GC forcibly in a period of given
gc_urgent_sleep_time and executes a mid level of I/O idling check.
uses cost benefit GC approach.

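As a sketch, a maintenance job could temporarily force urgent GC and then restore normal behaviour (sdb is an example device name)::

    echo 1 > /sys/fs/f2fs/sdb/gc_urgent     # urgent high: GC every gc_urgent_sleep_time
    sleep 60                                # let GC run for a while
    echo 0 > /sys/fs/f2fs/sdb/gc_urgent     # back to normal background GC
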
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date: August 2017
@@ -263,7 +337,7 @@ Date April 2019
Contact: "Daniel Rosenberg" <drosen@google.com>
Description: If checkpoint=disable, it displays the number of blocks that
are unusable.
If checkpoint=enable it displays the enumber of blocks that
If checkpoint=enable it displays the number of blocks that
would be unusable if checkpoint=disable were to be set.

What: /sys/fs/f2fs/<disk>/encoding
@@ -347,3 +421,143 @@ Date: April 2020
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Give a way to change iostat_period time. 3secs by default.
The new iostat trace gives stats gap given the period.

What: /sys/fs/f2fs/<disk>/max_io_bytes
Date: December 2020
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: This gives a control to limit the bio size in f2fs.
Default is zero, which will follow underlying block layer limit,
whereas, if it has a certain bytes value, f2fs won't submit a
bio larger than that size.

What: /sys/fs/f2fs/<disk>/stat/sb_status
Date: December 2020
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Show status of f2fs superblock in real time.

====== ===================== =================================
value  sb status macro       description
0x1    SBI_IS_DIRTY          dirty flag for checkpoint
0x2    SBI_IS_CLOSE          specify unmounting
0x4    SBI_NEED_FSCK         need fsck.f2fs to fix
0x8    SBI_POR_DOING         recovery is doing or not
0x10   SBI_NEED_SB_WRITE     need to recover superblock
0x20   SBI_NEED_CP           need to checkpoint
0x40   SBI_IS_SHUTDOWN       shutdown by ioctl
0x80   SBI_IS_RECOVERED      recovered orphan/data
0x100  SBI_CP_DISABLED       CP was disabled last mount
0x200  SBI_CP_DISABLED_QUICK CP was disabled quickly
0x400  SBI_QUOTA_NEED_FLUSH  need to flush quota info in CP
0x800  SBI_QUOTA_SKIP_FLUSH  skip flushing quota in current CP
0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted
0x2000 SBI_IS_RESIZEFS       resizefs is in process
0x4000 SBI_IS_FREEZING       freefs is in process
====== ===================== =================================

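For illustration, a monitoring script could poll this node and test individual flags. A sketch only: sdb is an example device name, and the exact numeric formatting of the output is not specified above, so the shell parsing here is an assumption::

    status=$(cat /sys/fs/f2fs/sdb/stat/sb_status)
    if (( status & 0x4 )); then        # SBI_NEED_FSCK
        echo "run fsck.f2fs on this device"
    fi
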
What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
Date: January 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Give a way to change checkpoint merge daemon's io priority.
Its default value is "be,3", which means "BE" I/O class and
I/O priority "3". We can select the class between "rt" and "be",
and set the I/O priority within valid range of it. "," delimiter
is necessary in between I/O class and priority number.

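For example, a minimal sketch with sdb standing in for the real device name::

    echo "be,0" > /sys/fs/f2fs/sdb/ckpt_thread_ioprio   # highest best-effort priority
    echo "rt,3" > /sys/fs/f2fs/sdb/ckpt_thread_ioprio   # move to the realtime class
    cat /sys/fs/f2fs/sdb/ckpt_thread_ioprio
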
What: /sys/fs/f2fs/<disk>/ovp_segments
Date: March 2021
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of overprovision segments.

What: /sys/fs/f2fs/<disk>/compr_written_block
Date: March 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the block count written after compression since mount. Note
that when the compressed blocks are deleted, this count doesn't
decrease. If you write "0" here, you can initialize
compr_written_block and compr_saved_block to "0".

What: /sys/fs/f2fs/<disk>/compr_saved_block
Date: March 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the saved block count with compression since mount. Note
that when the compressed blocks are deleted, this count doesn't
decrease. If you write "0" here, you can initialize
compr_written_block and compr_saved_block to "0".

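As an illustration, the two counters can be combined into a rough compression benefit figure; a sketch only, with sdb as an example device name::

    written=$(cat /sys/fs/f2fs/sdb/compr_written_block)
    saved=$(cat /sys/fs/f2fs/sdb/compr_saved_block)
    echo "saved $saved of $((written + saved)) compressed blocks"
    echo 0 > /sys/fs/f2fs/sdb/compr_written_block   # resets both counters
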
What: /sys/fs/f2fs/<disk>/compr_new_inode
Date: March 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the count of inode newly enabled for compression since mount.
Note that when the compression is disabled for the files, this count
doesn't decrease. If you write "0" here, you can initialize
compr_new_inode to "0".

What: /sys/fs/f2fs/<disk>/atgc_candidate_ratio
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls candidate ratio in order to limit total
number of potential victim in all candidates, the value should be in
range of [0, 100], by default it was initialized as 20(%).

What: /sys/fs/f2fs/<disk>/atgc_candidate_count
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls candidate count in order to limit total
number of potential victim in all candidates, by default it was
initialized as 10 (sections).

What: /sys/fs/f2fs/<disk>/atgc_age_weight
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls age weight to balance weight proportion
in between aging and valid blocks, the value should be in range of
[0, 100], by default it was initialized as 60(%).

What: /sys/fs/f2fs/<disk>/atgc_age_threshold
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls age threshold to bypass GCing young
candidates whose age is not beyond the threshold, by default it was
initialized as 604800 seconds (equals to 7 days).

What: /sys/fs/f2fs/<disk>/gc_reclaimed_segments
Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show how many segments have been reclaimed by GC during a specific
GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy,
3: GC idle AT, 4: GC urgent high, 5: GC urgent low, 6: GC urgent mid)
You can re-initialize this value to "0".

What: /sys/fs/f2fs/<disk>/gc_segment_mode
Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can control for which gc mode the "gc_reclaimed_segments" node shows.
Refer to the description of the modes in "gc_reclaimed_segments".

What: /sys/fs/f2fs/<disk>/max_fragment_chunk
Date: August 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: With "mode=fragment:block" mount options, we can scatter block allocation.
f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.

What: /sys/fs/f2fs/<disk>/max_fragment_hole
Date: August 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: With "mode=fragment:block" mount options, we can scatter block allocation.
f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.

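For illustration, the fragmentation test mode and these two knobs could be exercised like this; a sketch intended for throwaway test filesystems only, with the device and mount point as examples::

    mount -t f2fs -o mode=fragment:block /dev/sdb /mnt/f2fs
    echo 8 > /sys/fs/f2fs/sdb/max_fragment_chunk   # chunks of up to 8 blocks
    echo 2 > /sys/fs/f2fs/sdb/max_fragment_hole    # holes of up to 2 blocks
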
What: /sys/fs/f2fs/<disk>/gc_urgent_high_remaining
Date: December 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can set the trial count limit for GC urgent high mode with this value.
If GC thread gets to the limit, the mode will turn back to GC normal mode.
By default, the value is zero, which means there is no limit like before.

What: /sys/fs/f2fs/<disk>/max_roll_forward_node_blocks
Date: January 2022
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Controls max # of node block writes to be used for roll forward
recovery. This can limit the roll forward recovery time.

Documentation/filesystems/f2fs.rst (new file, 956 lines)
@@ -0,0 +1,956 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==========================================
|
||||
WHAT IS Flash-Friendly File System (F2FS)?
|
||||
==========================================
|
||||
|
||||
NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
|
||||
been equipped on a variety of systems ranging from mobile to server systems. Since
|
||||
they are known to have different characteristics from the conventional rotating
|
||||
disks, a file system, an upper layer to the storage device, should adapt to the
|
||||
changes from the sketch in the design level.
|
||||
|
||||
F2FS is a file system exploiting NAND flash memory-based storage devices, which
|
||||
is based on Log-structured File System (LFS). The design has been focused on
|
||||
addressing the fundamental issues in LFS, which are snowball effect of wandering
|
||||
tree and high cleaning overhead.
|
||||
|
||||
Since a NAND flash memory-based storage device shows different characteristic
|
||||
according to its internal geometry or flash memory management scheme, namely FTL,
|
||||
F2FS and its tools support various parameters not only for configuring on-disk
|
||||
layout, but also for selecting allocation and cleaning algorithms.
|
||||
|
||||
The following git tree provides the file system formatting tool (mkfs.f2fs),
|
||||
a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
|
||||
|
||||
- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
|
||||
|
||||
For reporting bugs and sending patches, please use the following mailing list:
|
||||
|
||||
- linux-f2fs-devel@lists.sourceforge.net
|
||||
|
||||
Background and Design issues
|
||||
============================
|
||||
|
||||
Log-structured File System (LFS)
|
||||
--------------------------------
|
||||
"A log-structured file system writes all modifications to disk sequentially in
|
||||
a log-like structure, thereby speeding up both file writing and crash recovery.
|
||||
The log is the only structure on disk; it contains indexing information so that
|
||||
files can be read back from the log efficiently. In order to maintain large free
|
||||
areas on disk for fast writing, we divide the log into segments and use a
|
||||
segment cleaner to compress the live information from heavily fragmented
|
||||
segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and
|
||||
implementation of a log-structured file system", ACM Trans. Computer Systems
|
||||
10, 1, 26–52.
|
||||
|
||||
Wandering Tree Problem
|
||||
----------------------
|
||||
In LFS, when a file data is updated and written to the end of log, its direct
|
||||
pointer block is updated due to the changed location. Then the indirect pointer
|
||||
block is also updated due to the direct pointer block update. In this manner,
|
||||
the upper index structures such as inode, inode map, and checkpoint block are
|
||||
also updated recursively. This problem is called as wandering tree problem [1],
|
||||
and in order to enhance the performance, it should eliminate or relax the update
|
||||
propagation as much as possible.
|
||||
|
||||
[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/
|
||||
|
||||
Cleaning Overhead
|
||||
-----------------
|
||||
Since LFS is based on out-of-place writes, it produces so many obsolete blocks
|
||||
scattered across the whole storage. In order to serve new empty log space, it
|
||||
needs to reclaim these obsolete blocks seamlessly to users. This job is called
|
||||
as a cleaning process.
|
||||
|
||||
The process consists of three operations as follows.
|
||||
|
||||
1. A victim segment is selected through referencing segment usage table.
|
||||
2. It loads parent index structures of all the data in the victim identified by
|
||||
segment summary blocks.
|
||||
3. It checks the cross-reference between the data and its parent index structure.
|
||||
4. It moves valid data selectively.
|
||||
|
||||
This cleaning job may cause unexpected long delays, so the most important goal
|
||||
is to hide the latencies to users. And also definitely, it should reduce the
|
||||
amount of valid data to be moved, and move them quickly as well.
|
||||
|
||||
Key Features
|
||||
============
|
||||
|
||||
Flash Awareness
|
||||
---------------
|
||||
- Enlarge the random write area for better performance, but provide the high
|
||||
spatial locality
|
||||
- Align FS data structures to the operational units in FTL as best efforts
|
||||
|
||||
Wandering Tree Problem
|
||||
----------------------
|
||||
- Use a term, “node”, that represents inodes as well as various pointer blocks
|
||||
- Introduce Node Address Table (NAT) containing the locations of all the “node”
|
||||
blocks; this will cut off the update propagation.
|
||||
|
||||
Cleaning Overhead
|
||||
-----------------
|
||||
- Support a background cleaning process
|
||||
- Support greedy and cost-benefit algorithms for victim selection policies
|
||||
- Support multi-head logs for static/dynamic hot and cold data separation
|
||||
- Introduce adaptive logging for efficient block allocation
|
||||
|
||||
Mount Options
|
||||
=============
|
||||
|
||||
|
||||
======================== ============================================================
|
||||
background_gc=%s Turn on/off cleaning operations, namely garbage
|
||||
collection, triggered in background when I/O subsystem is
|
||||
idle. If background_gc=on, it will turn on the garbage
|
||||
collection and if background_gc=off, garbage collection
|
||||
will be turned off. If background_gc=sync, it will turn
|
||||
on synchronous garbage collection running in background.
|
||||
Default value for this option is on. So garbage
|
||||
collection is on by default.
|
||||
gc_merge When background_gc is on, this option can be enabled to
|
||||
let background GC thread to handle foreground GC requests,
|
||||
it can eliminate the sluggish issue caused by slow foreground
|
||||
GC operation when GC is triggered from a process with limited
|
||||
I/O and CPU resources.
|
||||
nogc_merge Disable GC merge feature.
|
||||
disable_roll_forward Disable the roll-forward recovery routine
|
||||
norecovery Disable the roll-forward recovery routine, mounted read-
|
||||
only (i.e., -o ro,disable_roll_forward)
|
||||
discard/nodiscard Enable/disable real-time discard in f2fs, if discard is
|
||||
enabled, f2fs will issue discard/TRIM commands when a
|
||||
segment is cleaned.
|
||||
no_heap Disable heap-style segment allocation which finds free
|
||||
segments for data from the beginning of main area, while
|
||||
for node from the end of main area.
|
||||
nouser_xattr Disable Extended User Attributes. Note: xattr is enabled
|
||||
by default if CONFIG_F2FS_FS_XATTR is selected.
|
||||
noacl Disable POSIX Access Control List. Note: acl is enabled
|
||||
by default if CONFIG_F2FS_FS_POSIX_ACL is selected.
|
||||
active_logs=%u Support configuring the number of active logs. In the
|
||||
current design, f2fs supports only 2, 4, and 6 logs.
|
||||
Default number is 6.
|
||||
disable_ext_identify Disable the extension list configured by mkfs, so f2fs
|
||||
is not aware of cold files such as media files.
|
||||
inline_xattr Enable the inline xattrs feature.
|
||||
noinline_xattr Disable the inline xattrs feature.
|
||||
inline_xattr_size=%u Support configuring inline xattr size, it depends on
|
||||
flexible inline xattr feature.
|
||||
inline_data Enable the inline data feature: Newly created small (<~3.4k)
|
||||
files can be written into inode block.
|
||||
inline_dentry Enable the inline dir feature: data in newly created
|
||||
directory entries can be written into inode block. The
|
||||
space of inode block which is used to store inline
|
||||
dentries is limited to ~3.4k.
|
||||
noinline_dentry Disable the inline dentry feature.
|
||||
flush_merge Merge concurrent cache_flush commands as much as possible
|
||||
to eliminate redundant command issues. If the underlying
|
||||
device handles the cache_flush command relatively slowly,
|
||||
recommend to enable this option.
|
||||
nobarrier This option can be used if underlying storage guarantees
|
||||
its cached data should be written to the non-volatile area.
|
||||
If this option is set, no cache_flush commands are issued
|
||||
but f2fs still guarantees the write ordering of all the
|
||||
data writes.
|
||||
fastboot This option is used when a system wants to reduce mount
|
||||
time as much as possible, even though normal performance
|
||||
can be sacrificed.
|
||||
extent_cache Enable an extent cache based on rb-tree, it can cache
|
||||
as many as extent which map between contiguous logical
|
||||
address and physical address per inode, resulting in
|
||||
increasing the cache hit ratio. Set by default.
|
||||
noextent_cache Disable an extent cache based on rb-tree explicitly, see
|
||||
the above extent_cache mount option.
|
||||
noinline_data Disable the inline data feature, inline data feature is
|
||||
enabled by default.
|
||||
data_flush Enable data flushing before checkpoint in order to
|
||||
persist data of regular and symlink.
|
||||
reserve_root=%d Support configuring reserved space which is used for
|
||||
allocation from a privileged user with specified uid or
|
||||
gid, unit: 4KB, the default limit is 0.2% of user blocks.
|
||||
resuid=%d The user ID which may use the reserved blocks.
|
||||
resgid=%d The group ID which may use the reserved blocks.
|
||||
fault_injection=%d Enable fault injection in all supported types with
|
||||
specified injection rate.
|
||||
fault_type=%d Support configuring fault injection type, should be
|
||||
enabled with fault_injection option, fault type value
|
||||
is shown below, it supports single or combined type.
|
||||
|
||||
=================== ===========
|
||||
Type_Name Type_Value
|
||||
=================== ===========
|
||||
FAULT_KMALLOC 0x000000001
|
||||
FAULT_KVMALLOC 0x000000002
|
||||
FAULT_PAGE_ALLOC 0x000000004
|
||||
FAULT_PAGE_GET 0x000000008
|
||||
FAULT_ALLOC_BIO 0x000000010 (obsolete)
|
||||
FAULT_ALLOC_NID 0x000000020
|
||||
FAULT_ORPHAN 0x000000040
|
||||
FAULT_BLOCK 0x000000080
|
||||
FAULT_DIR_DEPTH 0x000000100
|
||||
FAULT_EVICT_INODE 0x000000200
|
||||
FAULT_TRUNCATE 0x000000400
|
||||
FAULT_READ_IO 0x000000800
|
||||
FAULT_CHECKPOINT 0x000001000
|
||||
FAULT_DISCARD 0x000002000
|
||||
FAULT_WRITE_IO 0x000004000
|
||||
FAULT_SLAB_ALLOC 0x000008000
|
||||
FAULT_DQUOT_INIT 0x000010000
|
||||
FAULT_LOCK_OP 0x000020000
|
||||
=================== ===========
|
||||
mode=%s Control block allocation mode which supports "adaptive"
|
||||
and "lfs". In "lfs" mode, there should be no random
|
||||
writes towards main area.
|
||||
"fragment:segment" and "fragment:block" are newly added here.
|
||||
These are developer options for experiments to simulate filesystem
|
||||
fragmentation/after-GC situation itself. The developers use these
|
||||
modes to understand filesystem fragmentation/after-GC condition well,
|
||||
and eventually get some insights to handle them better.
|
||||
In "fragment:segment", f2fs allocates a new segment in ramdom
|
||||
position. With this, we can simulate the after-GC condition.
|
||||
In "fragment:block", we can scatter block allocation with
|
||||
"max_fragment_chunk" and "max_fragment_hole" sysfs nodes.
|
||||
We added some randomness to both chunk and hole size to make
|
||||
it close to realistic IO pattern. So, in this mode, f2fs will allocate
|
||||
1..<max_fragment_chunk> blocks in a chunk and make a hole in the
|
||||
length of 1..<max_fragment_hole> by turns. With this, the newly
|
||||
allocated blocks will be scattered throughout the whole partition.
|
||||
Note that "fragment:block" implicitly enables "fragment:segment"
|
||||
option for more randomness.
|
||||
Please, use these options for your experiments and we strongly
|
||||
recommend to re-format the filesystem after using these options.
|
||||
io_bits=%u Set the bit size of write IO requests. It should be set
|
||||
with "mode=lfs".
|
||||
usrquota Enable plain user disk quota accounting.
|
||||
grpquota Enable plain group disk quota accounting.
|
||||
prjquota Enable plain project quota accounting.
|
||||
usrjquota=<file> Appoint specified file and type during mount, so that quota
|
||||
grpjquota=<file> information can be properly updated during recovery flow,
|
||||
prjjquota=<file> <quota file>: must be in root directory;
|
||||
jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
|
||||
offusrjquota Turn off user journalled quota.
|
||||
offgrpjquota Turn off group journalled quota.
|
||||
offprjjquota Turn off project journalled quota.
|
||||
quota Enable plain user disk quota accounting.
|
||||
noquota Disable all plain disk quota option.
|
||||
whint_mode=%s Control which write hints are passed down to block
|
||||
layer. This supports "off", "user-based", and
|
||||
"fs-based". In "off" mode (default), f2fs does not pass
|
||||
down hints. In "user-based" mode, f2fs tries to pass
|
||||
down hints given by users. And in "fs-based" mode, f2fs
|
||||
passes down hints with its policy.
|
||||
alloc_mode=%s Adjust block allocation policy, which supports "reuse"
|
||||
and "default".
|
||||
fsync_mode=%s Control the policy of fsync. Currently supports "posix",
|
||||
"strict", and "nobarrier". In "posix" mode, which is
|
||||
default, fsync will follow POSIX semantics and does a
|
||||
light operation to improve the filesystem performance.
|
||||
In "strict" mode, fsync will be heavy and behaves in line
|
||||
with xfs, ext4 and btrfs, where xfstest generic/342 will
|
||||
pass, but the performance will regress. "nobarrier" is
|
||||
based on "posix", but doesn't issue flush command for
|
||||
non-atomic files likewise "nobarrier" mount option.
|
||||
test_dummy_encryption
|
||||
test_dummy_encryption=%s
|
||||
Enable dummy encryption, which provides a fake fscrypt
|
||||
context. The fake fscrypt context is used by xfstests.
|
||||
The argument may be either "v1" or "v2", in order to
|
||||
select the corresponding fscrypt policy version.
|
||||
checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable"
|
||||
to reenable checkpointing. Is enabled by default. While
|
||||
disabled, any unmounting or unexpected shutdowns will cause
|
||||
the filesystem contents to appear as they did when the
|
||||
filesystem was mounted with that option.
|
||||
While mounting with checkpoint=disabled, the filesystem must
|
||||
run garbage collection to ensure that all available space can
|
||||
be used. If this takes too much time, the mount may return
|
||||
EAGAIN. You may optionally add a value to indicate how much
|
||||
of the disk you would be willing to temporarily give up to
|
||||
avoid additional garbage collection. This can be given as a
|
||||
number of blocks, or as a percent. For instance, mounting
|
||||
with checkpoint=disable:100% would always succeed, but it may
|
||||
hide up to all remaining free space. The actual space that
|
||||
would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
|
||||
This space is reclaimed once checkpoint=enable.
|
||||
checkpoint_merge When checkpoint is enabled, this can be used to create a kernel
|
||||
daemon and make it to merge concurrent checkpoint requests as
|
||||
much as possible to eliminate redundant checkpoint issues. Plus,
|
||||
we can eliminate the sluggish issue caused by slow checkpoint
|
||||
operation when the checkpoint is done in a process context in
|
||||
a cgroup having low i/o budget and cpu shares. To make this
|
||||
do better, we set the default i/o priority of the kernel daemon
|
||||
to "3", to give one higher priority than other kernel threads.
|
||||
This is the same way to give a I/O priority to the jbd2
|
||||
journaling thread of ext4 filesystem.
|
||||
nocheckpoint_merge Disable checkpoint merge feature.
|
||||
compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
|
||||
"lz4", "zstd" and "lzo-rle" algorithm.
|
||||
compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only
|
||||
"lz4" and "zstd" support compress level config.
|
||||
algorithm level range
|
||||
lz4 3 - 16
|
||||
zstd 1 - 22
|
||||
compress_log_size=%u Support configuring compress cluster size, the size will
|
||||
be 4KB * (1 << %u), 16KB is minimum size, also it's
|
||||
default size.
|
||||
compress_extension=%s Support adding specified extension, so that f2fs can enable
|
||||
compression on those corresponding files, e.g. if all files
|
||||
with '.ext' has high compression rate, we can set the '.ext'
|
||||
on compression extension list and enable compression on
|
||||
these file by default rather than to enable it via ioctl.
|
||||
For other files, we can still enable compression via ioctl.
|
||||
Note that, there is one reserved special extension '*', it
|
||||
can be set to enable compression for all files.
|
||||
nocompress_extension=%s Support adding specified extension, so that f2fs can disable
|
||||
compression on those corresponding files, just contrary to compression extension.
|
||||
If you know exactly which files cannot be compressed, you can use this.
|
||||
The same extension name can't appear in both compress and nocompress
|
||||
extension at the same time.
|
||||
If the compress extension specifies all files, the types specified by the
|
||||
nocompress extension will be treated as special cases and will not be compressed.
|
||||
Using '*' to specify all files in the nocompress extension is not allowed.
|
||||
After adding nocompress_extension, the priority should be:
|
||||
dir_flag < comp_extension,nocompress_extension < comp_file_flag,no_comp_file_flag.
|
||||
See more in compression sections.
|
||||
|
||||
compress_chksum Support verifying chksum of raw data in compressed cluster.
|
||||
compress_mode=%s Control file compression mode. This supports "fs" and "user"
|
||||
modes. In "fs" mode (default), f2fs does automatic compression
|
||||
on the compression enabled files. In "user" mode, f2fs disables
|
||||
the automatic compression and gives the user discretion of
|
||||
choosing the target file and the timing. The user can do manual
|
||||
compression/decompression on the compression enabled files using
|
||||
ioctls.
|
||||
compress_cache Support to use address space of a filesystem managed inode to
|
||||
cache compressed block, in order to improve cache hit ratio of
|
||||
random read.
|
||||
inlinecrypt When possible, encrypt/decrypt the contents of encrypted
|
||||
files using the blk-crypto framework rather than
|
||||
filesystem-layer encryption. This allows the use of
|
||||
inline encryption hardware. The on-disk format is
|
||||
unaffected. For more details, see
|
||||
Documentation/block/inline-encryption.rst.
|
||||
atgc Enable age-threshold garbage collection, it provides high
|
||||
effectiveness and efficiency on background GC.
|
||||
discard_unit=%s Control discard unit, the argument can be "block", "segment"
|
||||
and "section", issued discard command's offset/size will be
|
||||
aligned to the unit, by default, "discard_unit=block" is set,
|
||||
so that small discard functionality is enabled.
|
||||
For blkzoned device, "discard_unit=section" will be set by
|
||||
default, it is helpful for large sized SMR or ZNS devices to
|
||||
reduce memory cost by getting rid of fs metadata supports small
|
||||
discard.
|
||||
======================== ============================================================
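
As an illustration, several of the options above can be combined on one mount line. This is a sketch: the device, mount point and values are examples, and compression additionally assumes an image formatted with the compression feature::

    mount -t f2fs -o mode=adaptive,gc_merge,checkpoint_merge,compress_algorithm=zstd:6,compress_extension=log /dev/sdb /mnt/f2fs

Options that take no argument, such as gc_merge and checkpoint_merge, appear by name; options with arguments use the option=value form shown in the table.
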
Debugfs Entries
|
||||
===============
|
||||
|
||||
/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as
|
||||
f2fs. Each file shows the whole f2fs information.
|
||||
|
||||
/sys/kernel/debug/f2fs/status includes:
|
||||
|
||||
- major file system information managed by f2fs currently
|
||||
- average SIT information about whole segments
|
||||
- current memory footprint consumed by f2fs.
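
For example, assuming debugfs is available and f2fs status support is built into the kernel (the mount step is only needed if debugfs is not already mounted)::

    # mount -t debugfs none /sys/kernel/debug
    # cat /sys/kernel/debug/f2fs/status
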
Sysfs Entries
|
||||
=============
|
||||
|
||||
Information about mounted f2fs file systems can be found in
|
||||
/sys/fs/f2fs. Each mounted filesystem will have a directory in
|
||||
/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda).
|
||||
The files in each per-device directory are shown in table below.
|
||||
|
||||
Files in /sys/fs/f2fs/<devname>
|
||||
(see also Documentation/ABI/testing/sysfs-fs-f2fs)

Usage
=====

1. Download userland tools and compile them.

2. Skip this step if f2fs was compiled statically inside the kernel.
   Otherwise, insert the f2fs.ko module::

    # insmod f2fs.ko

3. Create a directory to use when mounting::

    # mkdir /mnt/f2fs

4. Format the block device, and then mount as f2fs::

    # mkfs.f2fs -l label /dev/block_device
    # mount -t f2fs /dev/block_device /mnt/f2fs

mkfs.f2fs
---------
mkfs.f2fs formats a partition as an f2fs filesystem, building a basic
on-disk layout.

The quick options consist of:

=============== ===========================================================
``-l [label]``  Give a volume label, up to 512 unicode characters.
``-a [0 or 1]`` Split start location of each area for heap-based allocation.

                1 is set by default, which performs this.
``-o [int]``    Set overprovision ratio in percent over volume size.

                5 is set by default.
``-s [int]``    Set the number of segments per section.

                1 is set by default.
``-z [int]``    Set the number of sections per zone.

                1 is set by default.
``-e [str]``    Set basic extension list. e.g. "mp3,gif,mov"
``-t [0 or 1]`` Disable discard command or not.

                1 is set by default, which conducts discard.
=============== ===========================================================

Note: please refer to the manpage of mkfs.f2fs(8) to get the full option list.

fsck.f2fs
---------
fsck.f2fs is a tool that checks the consistency of an f2fs-formatted
partition; it examines whether the filesystem metadata and user data
are cross-referenced correctly.
Note that the initial version of the tool does not fix any inconsistency.

The quick options consist of::

  -d debug level [default:0]

Note: please refer to the manpage of fsck.f2fs(8) to get the full option list.

dump.f2fs
---------
dump.f2fs shows the information of a specific inode and dumps the SSA and SIT
to files named dump_ssa and dump_sit.

dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
It shows on-disk inode information recognized by a given inode number, and is
able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
./dump_sit respectively.

The options consist of::

  -d debug level [default:0]
  -i inode no (hex)
  -s [SIT dump segno from #1~#2 (decimal), for all 0~-1]
  -a [SSA dump segno from #1~#2 (decimal), for all 0~-1]

Examples::

    # dump.f2fs -i [ino] /dev/sdx
    # dump.f2fs -s 0~-1 /dev/sdx (SIT dump)
    # dump.f2fs -a 0~-1 /dev/sdx (SSA dump)

Note: please refer to the manpage of dump.f2fs(8) to get the full option list.

sload.f2fs
----------
sload.f2fs gives a way to insert files and directories into an existing disk
image. This tool is useful when building f2fs images from compiled files.

Note: please refer to the manpage of sload.f2fs(8) to get the full option list.

resize.f2fs
-----------
resize.f2fs lets a user resize an f2fs-formatted disk image while preserving
all the files and directories stored in the image.

Note: please refer to the manpage of resize.f2fs(8) to get the full option list.

defrag.f2fs
-----------
defrag.f2fs can be used to defragment scattered written data as well as
filesystem metadata across the disk. This can improve the write speed by giving
more free consecutive space.

Note: please refer to the manpage of defrag.f2fs(8) to get the full option list.

f2fs_io
-------
f2fs_io is a simple tool to issue various filesystem APIs as well as
f2fs-specific ones, which is very useful for QA tests.

Note: please refer to the manpage of f2fs_io(8) to get the full option list.

Design
======

On-disk Layout
--------------

F2FS divides the whole volume into a number of segments, each of which is fixed
to 2MB in size. A section is composed of consecutive segments, and a zone
consists of a set of sections. By default, section and zone sizes are set to one
segment size identically, but users can easily modify the sizes by mkfs.

F2FS splits the entire volume into six areas, and all the areas except superblock
consist of multiple segments as described below::

                                            align with the zone size <-|
                 |-> align with the segment size
     _________________________________________________________________________
    |            |            |   Segment   |    Node     |  Segment   |      |
    | Superblock | Checkpoint |    Info.    |   Address   |  Summary   | Main |
    |    (SB)    |    (CP)    | Table (SIT) | Table (NAT) | Area (SSA) |      |
    |____________|_____2______|______N______|______N______|______N_____|__N___|
                                                                       .      .
                                                             .                .
                                                 .                            .
                                    ._________________________________________.
                                    |_Segment_|_..._|_Segment_|_..._|_Segment_|
                                    .           .
                                    ._________._________
                                    |_section_|__...__|_
                                    .            .
                                    .________.
                                    |__zone__|

- Superblock (SB)
   It is located at the beginning of the partition, and there exist two copies
   to avoid file system crash. It contains basic partition information and some
   default parameters of f2fs.

- Checkpoint (CP)
   It contains file system information, bitmaps for valid NAT/SIT sets, orphan
   inode lists, and summary entries of current active segments.

- Segment Information Table (SIT)
   It contains segment information such as valid block count and bitmap for the
   validity of all the blocks.

- Node Address Table (NAT)
   It is composed of a block address table for all the node blocks stored in
   Main area.

- Segment Summary Area (SSA)
   It contains summary entries which contain the owner information of all the
   data and node blocks stored in Main area.

- Main Area
   It contains file and directory data including their indices.

In order to avoid misalignment between file system and flash-based storage, F2FS
aligns the start block address of CP with the segment size. Also, it aligns the
start block address of Main area with the zone size by reserving some segments
in SSA area.

Refer to the following survey for additional technical details.
https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey

File System Metadata Structure
------------------------------

F2FS adopts the checkpointing scheme to maintain file system consistency. At
mount time, F2FS first tries to find the last valid checkpoint data by scanning
the CP area. In order to reduce the scanning time, F2FS uses only two copies of CP.
One of them always indicates the last valid data, which is called the shadow copy
mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism.

For file system consistency, each CP points to which NAT and SIT copies are
valid, as shown below::

  +--------+----------+---------+
  |   CP   |    SIT   |   NAT   |
  +--------+----------+---------+
  .        .          .         .
  .        .          .         .
  .        .          .         .
  +-------+-------+--------+--------+--------+--------+
  | CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 |
  +-------+-------+--------+--------+--------+--------+
  |             ^                 ^
  |             |                 |
  `----------------------------------------'

Index Structure
---------------

The key data structure to manage the data locations is a "node". Similar to
traditional file structures, F2FS has three types of node: inode, direct node,
indirect node. F2FS assigns 4KB to an inode block which contains 923 data block
indices, two direct node pointers, two indirect node pointers, and one double
indirect node pointer as described below. One direct node block contains 1018
data blocks, and one indirect node block also contains 1018 node blocks. Thus,
one inode block (i.e., a file) covers::

  4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB.

   Inode block (4KB)
     |- data (923)
     |- direct node (2)
     |          `- data (1018)
     |- indirect node (2)
     |            `- direct node (1018)
     |                       `- data (1018)
     `- double indirect node (1)
                         `- indirect node (1018)
                                      `- direct node (1018)
                                                 `- data (1018)

Note that all the node blocks are mapped by NAT, which means the location of
each node is translated by the NAT table. In consideration of the wandering
tree problem, F2FS is able to cut off the propagation of node updates caused by
leaf data writes.
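
The capacity figure quoted above is easy to sanity-check. The following is a
small, illustrative C program (not part of f2fs) that simply multiplies the
block counts from the tree by the 4KB block size::

    #include <stdio.h>

    /* Rough check of the maximum file size quoted above: 923 direct
     * indices in the inode, 2 direct nodes, 2 indirect nodes and one
     * double indirect node, each node block holding 1018 entries. */
    int main(void)
    {
        unsigned long long blocks = 923ULL
            + 2ULL * 1018
            + 2ULL * 1018 * 1018
            + 1018ULL * 1018 * 1018;
        double tb = blocks * 4096.0 / (1ULL << 40);

        printf("max blocks: %llu (~%.2f TB)\n", blocks, tb);
        return 0;
    }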

Directory Structure
-------------------

A directory entry occupies 11 bytes, which consists of the following attributes.

- hash          hash value of the file name
- ino           inode number
- len           the length of file name
- type          file type such as directory, symlink, etc

A dentry block consists of 214 dentry slots and file names. Therein a bitmap is
used to represent whether each dentry is valid or not. A dentry block occupies
4KB with the following composition.

::

  Dentry Block (4KB) = bitmap (27 bytes) + reserved (3 bytes) +
                       dentries (11 * 214 bytes) + file name (8 * 214 bytes)

                      [Bucket]
             +--------------------------------+
             |dentry block 1 | dentry block 2 |
             +--------------------------------+
             .               .
       .                          .
  .        [Dentry Block Structure: 4KB]       .
  +--------+----------+----------+------------+
  | bitmap | reserved | dentries | file names |
  +--------+----------+----------+------------+
  [Dentry Block: 4KB] .          .
                 .               .
            .                          .
            +------+------+-----+------+
            | hash | ino  | len | type |
            +------+------+-----+------+
            [Dentry Structure: 11 bytes]
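
For readers who prefer code to prose, the same layout can be sketched as the C
structures below. The names and the packing attribute are hypothetical; the
kernel's own definitions live in the fs/f2fs headers and may differ in naming,
but the sizes follow directly from the text (27 + 3 + 214*11 + 214*8 = 4096)::

    #include <stdint.h>

    /* Illustrative view of the layout described above; names are
     * hypothetical and sizes follow the text, not the kernel headers. */
    #define DENTRIES_PER_BLOCK  214
    #define NAME_SLOT_LEN       8       /* bytes of file name per slot */

    struct f2fs_doc_dir_entry {         /* 11 bytes per entry */
        uint32_t hash;                  /* hash value of the file name */
        uint32_t ino;                   /* inode number */
        uint16_t name_len;              /* length of the file name */
        uint8_t  file_type;             /* dir, regular, symlink, ... */
    } __attribute__((packed));

    struct f2fs_doc_dentry_block {      /* exactly 4KB */
        uint8_t  bitmap[27];            /* validity bit per dentry slot */
        uint8_t  reserved[3];
        struct f2fs_doc_dir_entry dentry[DENTRIES_PER_BLOCK];
        uint8_t  filename[DENTRIES_PER_BLOCK][NAME_SLOT_LEN];
    };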

F2FS implements multi-level hash tables for the directory structure. Each level
has a hash table with a dedicated number of hash buckets as shown below. Note
that "A(2B)" means a bucket includes 2 data blocks.

::

    ----------------------
    A : bucket
    B : block
    N : MAX_DIR_HASH_DEPTH
    ----------------------

    level #0   | A(2B)
         |
    level #1   | A(2B) - A(2B)
         |
    level #2   | A(2B) - A(2B) - A(2B) - A(2B)
         .     |   .       .       .       .
    level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B)
         .     |   .       .       .       .
    level #N   | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B)

The number of blocks and buckets are determined by::

                              ,- 2, if n < MAX_DIR_HASH_DEPTH / 2,
    # of blocks in level #n = |
                              `- 4, Otherwise

                               ,- 2^(n + dir_level),
                               |            if n + dir_level < MAX_DIR_HASH_DEPTH / 2,
    # of buckets in level #n = |
                               `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1),
                                            Otherwise

When F2FS finds a file name in a directory, at first a hash value of the file
name is calculated. Then, F2FS scans the hash table in level #0 to find the
dentry consisting of the file name and its inode number. If not found, F2FS
scans the next hash table in level #1. In this way, F2FS scans hash tables in
each level incrementally from 1 to N. In each level F2FS needs to scan only
one bucket determined by the following equation, which shows O(log(# of files))
complexity::

  bucket number to scan in level #n = (hash value) % (# of buckets in level #n)
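
The two formulas and the bucket selection can be expressed as a small C sketch;
MAX_DIR_HASH_DEPTH and dir_level are passed in as plain parameters here rather
than taken from the kernel headers::

    /* Sketch of the bucket/block counts above; illustrative only. */
    static unsigned int blocks_in_level(unsigned int n, unsigned int max_depth)
    {
        return (n < max_depth / 2) ? 2 : 4;
    }

    static unsigned int buckets_in_level(unsigned int n, unsigned int dir_level,
                                         unsigned int max_depth)
    {
        if (n + dir_level < max_depth / 2)
            return 1U << (n + dir_level);
        return 1U << (max_depth / 2 - 1);
    }

    static unsigned int bucket_to_scan(unsigned int hash, unsigned int n,
                                       unsigned int dir_level,
                                       unsigned int max_depth)
    {
        return hash % buckets_in_level(n, dir_level, max_depth);
    }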

In the case of file creation, F2FS finds empty consecutive slots that cover the
file name. F2FS searches the empty slots in the hash tables of whole levels from
1 to N in the same way as the lookup operation.

The following figure shows an example of two cases holding children::

       --------------> Dir <--------------
      |                                   |
    child                               child

    child - child                   [hole] - child

    child - child - child      [hole] - [hole] - child

       Case 1:                          Case 2:
       Number of children = 6,          Number of children = 3,
       File size = 7                    File size = 7

Default Block Allocation
------------------------

At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node
and Hot/Warm/Cold data.

- Hot node contains direct node blocks of directories.
- Warm node contains direct node blocks except hot node blocks.
- Cold node contains indirect node blocks.
- Hot data contains dentry blocks.
- Warm data contains data blocks except hot and cold data blocks.
- Cold data contains multimedia data or migrated data blocks.

LFS has two schemes for free space management: threaded log and
copy-and-compaction. The copy-and-compaction scheme, which is known as cleaning,
is well-suited for devices showing very good sequential write performance, since
free segments are served all the time for writing new data. However, it suffers
from cleaning overhead under high utilization. Contrarily, the threaded log
scheme suffers from random writes, but no cleaning process is needed. F2FS
adopts a hybrid scheme where the copy-and-compaction scheme is adopted by
default, but the policy is dynamically changed to the threaded log scheme
according to the file system status.

In order to align F2FS with underlying flash-based storage, F2FS allocates a
segment in a unit of section. F2FS expects that the section size would be the
same as the unit size of garbage collection in FTL. Furthermore, with respect
to the mapping granularity in FTL, F2FS allocates each section of the active
logs from different zones as much as possible, since FTL can write the data in
the active logs into one allocation unit according to its mapping granularity.

Cleaning process
----------------

F2FS does cleaning both on demand and in the background. On-demand cleaning is
triggered when there are not enough free segments to serve VFS calls. The
background cleaner is operated by a kernel thread and triggers the cleaning job
when the system is idle.

F2FS supports two victim selection policies: greedy and cost-benefit algorithms.
In the greedy algorithm, F2FS selects a victim segment having the smallest number
of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment
according to the segment age and the number of valid blocks in order to address
the log block thrashing problem in the greedy algorithm. F2FS adopts the greedy
algorithm for the on-demand cleaner, while the background cleaner adopts the
cost-benefit algorithm.
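
The two policies can be summarized with the hedged sketch below. It follows the
classic greedy/cost-benefit formulation from the LFS literature; the exact
scoring used in fs/f2fs/gc.c may differ::

    /* Illustrative victim scoring, not the exact f2fs implementation. */

    /* Greedy: pick the segment with the fewest valid blocks (minimize). */
    static unsigned int greedy_cost(unsigned int valid_blocks)
    {
        return valid_blocks;
    }

    /* Cost-benefit: benefit/cost = age * (1 - u) / (1 + u), where u is the
     * segment utilization; pick the segment with the largest ratio. */
    static double cost_benefit(unsigned int valid_blocks,
                               unsigned int blocks_per_seg, double age)
    {
        double u = (double)valid_blocks / blocks_per_seg;

        return age * (1.0 - u) / (1.0 + u);
    }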

In order to identify whether the data in the victim segment are valid or not,
F2FS manages a bitmap. Each bit represents the validity of a block, and the
bitmap is composed of a bit stream covering whole blocks in main area.

Write-hint Policy
-----------------

1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.

2) whint_mode=user-based. F2FS tries to pass down hints given by
users; a usage sketch follows the table below.

===================== ======================== ===================
User                  F2FS                     Block
===================== ======================== ===================
N/A                   META                     WRITE_LIFE_NOT_SET
N/A                   HOT_NODE                 "
N/A                   WARM_NODE                "
N/A                   COLD_NODE                "
ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
extension list        "                        "

-- buffered io
WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE       "                        "
WRITE_LIFE_MEDIUM     "                        "
WRITE_LIFE_LONG       "                        "

-- direct io
WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
===================== ======================== ===================
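
Under "user-based" mode, the hints in the User column come from the
application. One way to issue them is fcntl(F_SET_RW_HINT); the sketch below
assumes a v4.13+ kernel and a libc that exposes F_SET_RW_HINT and the RWH_*
constants (otherwise they can be taken from <linux/fcntl.h>)::

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mark a file's future writes as short-lived so that, with
     * whint_mode=user-based, f2fs steers them to the HOT_DATA log. */
    int mark_short_lived(int fd)
    {
        uint64_t hint = RWH_WRITE_LIFE_SHORT;

        if (fcntl(fd, F_SET_RW_HINT, &hint) < 0) {
            perror("F_SET_RW_HINT");
            return -1;
        }
        return 0;
    }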

3) whint_mode=fs-based. F2FS passes down hints with its policy.

===================== ======================== ===================
User                  F2FS                     Block
===================== ======================== ===================
N/A                   META                     WRITE_LIFE_MEDIUM
N/A                   HOT_NODE                 WRITE_LIFE_NOT_SET
N/A                   WARM_NODE                "
N/A                   COLD_NODE                WRITE_LIFE_NONE
ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
extension list        "                        "

-- buffered io
WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_LONG
WRITE_LIFE_NONE       "                        "
WRITE_LIFE_MEDIUM     "                        "
WRITE_LIFE_LONG       "                        "

-- direct io
WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
===================== ======================== ===================

Fallocate(2) Policy
-------------------

The default policy follows the POSIX rule below.

Allocating disk space
    The default operation (i.e., mode is zero) of fallocate() allocates
    the disk space within the range specified by offset and len. The
    file size (as reported by stat(2)) will be changed if offset+len is
    greater than the file size. Any subregion within the range specified
    by offset and len that did not contain data before the call will be
    initialized to zero. This default behavior closely resembles the
    behavior of the posix_fallocate(3) library function, and is intended
    as a method of optimally implementing that function.

However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) prior to
fallocate(fd, DEFAULT_MODE), it allocates on-disk block addresses holding
zero or random data, which is useful for the below scenario (a C sketch
follows the list), where:

1. create(fd)
2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
3. fallocate(fd, 0, 0, size)
4. address = fibmap(fd, offset)
5. open(blkdev)
6. write(blkdev, address)
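
A sketch of steps 1-4 above in C. It assumes the f2fs ioctls are visible via
the <linux/f2fs.h> UAPI header (added around v5.13; older trees carry these
definitions privately) and that the caller has the privileges FIBMAP requires::

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/types.h>
    #include <linux/fs.h>       /* FIBMAP */
    #include <linux/f2fs.h>     /* F2FS_IOC_SET_PIN_FILE (assumed available) */

    /* Pin the file, preallocate blocks, then look up the on-disk address
     * of the first block with FIBMAP (usually needs CAP_SYS_RAWIO/root). */
    int pin_and_map(const char *path, off_t size)
    {
        int fd = open(path, O_CREAT | O_RDWR, 0600);
        unsigned int set = 1;
        int blk = 0;            /* logical block 0 -> physical block no. */

        if (fd < 0)
            return -1;
        if (ioctl(fd, F2FS_IOC_SET_PIN_FILE, &set) < 0)
            perror("F2FS_IOC_SET_PIN_FILE");
        if (fallocate(fd, 0, 0, size) < 0)
            perror("fallocate");
        if (ioctl(fd, FIBMAP, &blk) < 0)
            perror("FIBMAP");
        else
            printf("block 0 is at physical block %d\n", blk);
        return fd;
    }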

Compression implementation
--------------------------

- A new term named cluster is defined as the basic unit of compression; a file
  can be divided into multiple clusters logically. One cluster includes 4 << n
  (n >= 0) logical pages, the compression size is also the cluster size, and
  each cluster can be compressed or not.

- In the cluster metadata layout, one special block address is used to indicate
  whether a cluster is a compressed one or a normal one; for a compressed
  cluster, the following metadata maps the cluster to [1, 4 << n - 1] physical
  blocks, where f2fs stores data including the compress header and compressed
  data.

- In order to eliminate write amplification during overwrite, F2FS only
  supports compression on write-once files; data can be compressed only when
  all logical blocks in a cluster contain valid data and the compress ratio of
  the cluster data is lower than the specified threshold.

- To enable compression on a regular inode, there are four ways:

  * chattr +c file
  * chattr +c dir; touch dir/file
  * mount w/ -o compress_extension=ext; touch file.ext
  * mount w/ -o compress_extension=*; touch any_file

- To disable compression on a regular inode, there are two ways:

  * chattr -c file
  * mount w/ -o nocompress_extension=ext; touch file.ext

- Priority in between FS_COMPR_FL, FS_NOCOMP_FS, extensions:

  * compress_extension=so; nocompress_extension=zip; chattr +c dir; touch
    dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so and baz.txt
    should be compressed, and bar.zip should be non-compressed. chattr +c
    dir/bar.zip can enable compression on bar.zip.
  * compress_extension=so; nocompress_extension=zip; chattr -c dir; touch
    dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so should be
    compressed, and bar.zip and baz.txt should be non-compressed.
    chattr +c dir/bar.zip; chattr +c dir/baz.txt; can enable compression on
    bar.zip and baz.txt.

- At this point, the compression feature doesn't expose compressed space to the
  user directly in order to guarantee potential data updates later to the space.
  Instead, the main goal is to reduce data writes to the flash disk as much as
  possible, resulting in extending disk life time as well as relaxing IO
  congestion. Alternatively, we've added the
  ioctl(F2FS_IOC_RELEASE_COMPRESS_BLOCKS) interface to reclaim compressed space
  and show it to the user after setting the immutable bit. Once released, the
  immutable bit prevents writing/mmapping the file until compressed space is
  reserved again via ioctl(F2FS_IOC_RESERVE_COMPRESS_BLOCKS) or the file size
  is truncated to zero, as sketched below.
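
As a hedged illustration of the release/reserve flow described above (again
assuming <linux/f2fs.h> provides the ioctl definitions, and that both ioctls
report a block count through a __u64 argument)::

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>
    #include <linux/f2fs.h>     /* assumed v5.13+ UAPI header */

    /* Reclaim the blocks saved by compression (the file becomes immutable),
     * and later give them back so the file is writable again. */
    void release_then_reserve(int fd)
    {
        __u64 blocks = 0;

        if (ioctl(fd, F2FS_IOC_RELEASE_COMPRESS_BLOCKS, &blocks) == 0)
            printf("released %llu blocks\n", (unsigned long long)blocks);

        /* ... later, before the file needs to be written again ... */
        if (ioctl(fd, F2FS_IOC_RESERVE_COMPRESS_BLOCKS, &blocks) == 0)
            printf("reserved %llu blocks\n", (unsigned long long)blocks);
    }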

Compress metadata layout::

                                [Dnode Structure]
                 +-----------------------------------------------+
                 | cluster 1 | cluster 2 | ......... | cluster N |
                 +-----------------------------------------------+
                 .           .           .           .
           .                       .                .            .
      .         Compressed Cluster       .        .        Normal Cluster            .
    +----------+---------+---------+---------+  +---------+---------+---------+---------+
    |compr flag| block 1 | block 2 | block 3 |  | block 1 | block 2 | block 3 | block 4 |
    +----------+---------+---------+---------+  +---------+---------+---------+---------+
               .                             .
             .                                           .
           .                                                           .
          +-------------+-------------+----------+----------------------------+
          | data length | data chksum | reserved |      compressed data       |
          +-------------+-------------+----------+----------------------------+

Compression mode
----------------

f2fs supports "fs" and "user" compression modes with the "compress_mode" mount
option. With this option, f2fs provides a choice of how to compress the
compression-enabled files (refer to the "Compression implementation" section
for how to enable compression on a regular inode).

1) compress_mode=fs
This is the default option. f2fs does automatic compression in the writeback of
the compression-enabled files.

2) compress_mode=user
This disables automatic compression and gives the user discretion in choosing
the target file and the timing. The user can do manual compression/decompression
on the compression-enabled files using the F2FS_IOC_DECOMPRESS_FILE and
F2FS_IOC_COMPRESS_FILE ioctls as below.

To decompress a file::

    fd = open(filename, O_WRONLY, 0);
    ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE);

To compress a file::

    fd = open(filename, O_WRONLY, 0);
    ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE);
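
A more complete, hedged version of the two snippets above, with error checking
added; header availability is assumed as in the earlier compression examples::

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/f2fs.h>     /* assumed v5.13+ UAPI header */

    /* Decompress, (optionally) rewrite, then compress a file again. */
    int recompress(const char *path)
    {
        int fd = open(path, O_WRONLY);

        if (fd < 0)
            return -1;
        if (ioctl(fd, F2FS_IOC_DECOMPRESS_FILE) < 0)
            perror("F2FS_IOC_DECOMPRESS_FILE");
        /* ... rewrite parts of the file here ... */
        if (ioctl(fd, F2FS_IOC_COMPRESS_FILE) < 0)
            perror("F2FS_IOC_COMPRESS_FILE");
        close(fd);
        return 0;
    }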

NVMe Zoned Namespace devices
----------------------------

- ZNS defines a per-zone capacity which can be equal to or less than the
  zone-size. Zone-capacity is the number of usable blocks in the zone.
  F2FS checks whether zone-capacity is less than zone-size; if it is, any
  segment which starts after the zone-capacity is marked as not-free in
  the free segment bitmap at initial mount time. These segments are marked
  as permanently used so they are not allocated for writes and
  consequently do not need to be garbage collected. In case the
  zone-capacity is not aligned to the default segment size (2MB), a segment
  can start before the zone-capacity and span across the zone-capacity
  boundary. Such spanning segments are also considered usable segments. All
  blocks past the zone-capacity are considered unusable in these segments,
  as sketched below.
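
The segment accounting described above can be sketched as follows; the helper
below is illustrative only and simply applies the 2MB segment size to a zone's
size and capacity::

    /* Sketch of the accounting described above, with 2MB segments.
     * Blocks after zone_capacity are unusable; a segment that merely
     * spans the boundary is still (partially) usable. */
    #define SEG_BYTES   (2ULL << 20)

    struct zone_usage {
        unsigned long long usable_segs;  /* fully or partially usable */
        unsigned long long dead_segs;    /* start at/after zone capacity */
    };

    static struct zone_usage account_zone(unsigned long long zone_size,
                                          unsigned long long zone_capacity)
    {
        struct zone_usage u;
        unsigned long long total = zone_size / SEG_BYTES;

        /* Segments whose start offset is below zone_capacity remain
         * usable, including one that spans the capacity boundary. */
        u.usable_segs = (zone_capacity + SEG_BYTES - 1) / SEG_BYTES;
        if (u.usable_segs > total)
            u.usable_segs = total;
        u.dead_segs = total - u.usable_segs;
        return u;
    }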
|
@ -1,674 +0,0 @@
|
||||
================================================================================
|
||||
WHAT IS Flash-Friendly File System (F2FS)?
|
||||
================================================================================
|
||||
|
||||
NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
|
||||
been equipped on a variety systems ranging from mobile to server systems. Since
|
||||
they are known to have different characteristics from the conventional rotating
|
||||
disks, a file system, an upper layer to the storage device, should adapt to the
|
||||
changes from the sketch in the design level.
|
||||
|
||||
F2FS is a file system exploiting NAND flash memory-based storage devices, which
|
||||
is based on Log-structured File System (LFS). The design has been focused on
|
||||
addressing the fundamental issues in LFS, which are snowball effect of wandering
|
||||
tree and high cleaning overhead.
|
||||
|
||||
Since a NAND flash memory-based storage device shows different characteristic
|
||||
according to its internal geometry or flash memory management scheme, namely FTL,
|
||||
F2FS and its tools support various parameters not only for configuring on-disk
|
||||
layout, but also for selecting allocation and cleaning algorithms.
|
||||
|
||||
The following git tree provides the file system formatting tool (mkfs.f2fs),
|
||||
a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
|
||||
>> git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
|
||||
|
||||
For reporting bugs and sending patches, please use the following mailing list:
|
||||
>> linux-f2fs-devel@lists.sourceforge.net
|
||||
|
||||
================================================================================
|
||||
BACKGROUND AND DESIGN ISSUES
|
||||
================================================================================
|
||||
|
||||
Log-structured File System (LFS)
|
||||
--------------------------------
|
||||
"A log-structured file system writes all modifications to disk sequentially in
|
||||
a log-like structure, thereby speeding up both file writing and crash recovery.
|
||||
The log is the only structure on disk; it contains indexing information so that
|
||||
files can be read back from the log efficiently. In order to maintain large free
|
||||
areas on disk for fast writing, we divide the log into segments and use a
|
||||
segment cleaner to compress the live information from heavily fragmented
|
||||
segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and
|
||||
implementation of a log-structured file system", ACM Trans. Computer Systems
|
||||
10, 1, 26–52.
|
||||
|
||||
Wandering Tree Problem
|
||||
----------------------
|
||||
In LFS, when a file data is updated and written to the end of log, its direct
|
||||
pointer block is updated due to the changed location. Then the indirect pointer
|
||||
block is also updated due to the direct pointer block update. In this manner,
|
||||
the upper index structures such as inode, inode map, and checkpoint block are
|
||||
also updated recursively. This problem is called as wandering tree problem [1],
|
||||
and in order to enhance the performance, it should eliminate or relax the update
|
||||
propagation as much as possible.
|
||||
|
||||
[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/
|
||||
|
||||
Cleaning Overhead
|
||||
-----------------
|
||||
Since LFS is based on out-of-place writes, it produces so many obsolete blocks
|
||||
scattered across the whole storage. In order to serve new empty log space, it
|
||||
needs to reclaim these obsolete blocks seamlessly to users. This job is called
|
||||
as a cleaning process.
|
||||
|
||||
The process consists of three operations as follows.
|
||||
1. A victim segment is selected through referencing segment usage table.
|
||||
2. It loads parent index structures of all the data in the victim identified by
|
||||
segment summary blocks.
|
||||
3. It checks the cross-reference between the data and its parent index structure.
|
||||
4. It moves valid data selectively.
|
||||
|
||||
This cleaning job may cause unexpected long delays, so the most important goal
|
||||
is to hide the latencies to users. And also definitely, it should reduce the
|
||||
amount of valid data to be moved, and move them quickly as well.
|
||||
|
||||
================================================================================
|
||||
KEY FEATURES
|
||||
================================================================================
|
||||
|
||||
Flash Awareness
|
||||
---------------
|
||||
- Enlarge the random write area for better performance, but provide the high
|
||||
spatial locality
|
||||
- Align FS data structures to the operational units in FTL as best efforts
|
||||
|
||||
Wandering Tree Problem
|
||||
----------------------
|
||||
- Use a term, “node”, that represents inodes as well as various pointer blocks
|
||||
- Introduce Node Address Table (NAT) containing the locations of all the “node”
|
||||
blocks; this will cut off the update propagation.
|
||||
|
||||
Cleaning Overhead
|
||||
-----------------
|
||||
- Support a background cleaning process
|
||||
- Support greedy and cost-benefit algorithms for victim selection policies
|
||||
- Support multi-head logs for static/dynamic hot and cold data separation
|
||||
- Introduce adaptive logging for efficient block allocation
|
||||
|
||||
================================================================================
|
||||
MOUNT OPTIONS
|
||||
================================================================================
|
||||
|
||||
background_gc=%s Turn on/off cleaning operations, namely garbage
|
||||
collection, triggered in background when I/O subsystem is
|
||||
idle. If background_gc=on, it will turn on the garbage
|
||||
collection and if background_gc=off, garbage collection
|
||||
will be turned off. If background_gc=sync, it will turn
|
||||
on synchronous garbage collection running in background.
|
||||
Default value for this option is on. So garbage
|
||||
collection is on by default.
|
||||
disable_roll_forward Disable the roll-forward recovery routine
|
||||
norecovery Disable the roll-forward recovery routine, mounted read-
|
||||
only (i.e., -o ro,disable_roll_forward)
|
||||
discard/nodiscard Enable/disable real-time discard in f2fs, if discard is
|
||||
enabled, f2fs will issue discard/TRIM commands when a
|
||||
segment is cleaned.
|
||||
no_heap Disable heap-style segment allocation which finds free
|
||||
segments for data from the beginning of main area, while
|
||||
for node from the end of main area.
|
||||
nouser_xattr Disable Extended User Attributes. Note: xattr is enabled
|
||||
by default if CONFIG_F2FS_FS_XATTR is selected.
|
||||
noacl Disable POSIX Access Control List. Note: acl is enabled
|
||||
by default if CONFIG_F2FS_FS_POSIX_ACL is selected.
|
||||
active_logs=%u Support configuring the number of active logs. In the
|
||||
current design, f2fs supports only 2, 4, and 6 logs.
|
||||
Default number is 6.
|
||||
disable_ext_identify Disable the extension list configured by mkfs, so f2fs
|
||||
does not aware of cold files such as media files.
|
||||
inline_xattr Enable the inline xattrs feature.
|
||||
noinline_xattr Disable the inline xattrs feature.
|
||||
inline_xattr_size=%u Support configuring inline xattr size, it depends on
|
||||
flexible inline xattr feature.
|
||||
inline_data Enable the inline data feature: New created small(<~3.4k)
|
||||
files can be written into inode block.
|
||||
inline_dentry Enable the inline dir feature: data in new created
|
||||
directory entries can be written into inode block. The
|
||||
space of inode block which is used to store inline
|
||||
dentries is limited to ~3.4k.
|
||||
noinline_dentry Disable the inline dentry feature.
|
||||
flush_merge Merge concurrent cache_flush commands as much as possible
|
||||
to eliminate redundant command issues. If the underlying
|
||||
device handles the cache_flush command relatively slowly,
|
||||
recommend to enable this option.
|
||||
nobarrier This option can be used if underlying storage guarantees
|
||||
its cached data should be written to the novolatile area.
|
||||
If this option is set, no cache_flush commands are issued
|
||||
but f2fs still guarantees the write ordering of all the
|
||||
data writes.
|
||||
fastboot This option is used when a system wants to reduce mount
|
||||
time as much as possible, even though normal performance
|
||||
can be sacrificed.
|
||||
extent_cache Enable an extent cache based on rb-tree, it can cache
|
||||
as many as extent which map between contiguous logical
|
||||
address and physical address per inode, resulting in
|
||||
increasing the cache hit ratio. Set by default.
|
||||
noextent_cache Disable an extent cache based on rb-tree explicitly, see
|
||||
the above extent_cache mount option.
|
||||
noinline_data Disable the inline data feature, inline data feature is
|
||||
enabled by default.
|
||||
data_flush Enable data flushing before checkpoint in order to
|
||||
persist data of regular and symlink.
|
||||
reserve_root=%d Support configuring reserved space which is used for
|
||||
allocation from a privileged user with specified uid or
|
||||
gid, unit: 4KB, the default limit is 0.2% of user blocks.
|
||||
resuid=%d The user ID which may use the reserved blocks.
|
||||
resgid=%d The group ID which may use the reserved blocks.
|
||||
fault_injection=%d Enable fault injection in all supported types with
|
||||
specified injection rate.
|
||||
fault_type=%d Support configuring fault injection type, should be
|
||||
enabled with fault_injection option, fault type value
|
||||
is shown below, it supports single or combined type.
|
||||
Type_Name Type_Value
|
||||
FAULT_KMALLOC 0x000000001
|
||||
FAULT_KVMALLOC 0x000000002
|
||||
FAULT_PAGE_ALLOC 0x000000004
|
||||
FAULT_PAGE_GET 0x000000008
|
||||
FAULT_ALLOC_BIO 0x000000010
|
||||
FAULT_ALLOC_NID 0x000000020
|
||||
FAULT_ORPHAN 0x000000040
|
||||
FAULT_BLOCK 0x000000080
|
||||
FAULT_DIR_DEPTH 0x000000100
|
||||
FAULT_EVICT_INODE 0x000000200
|
||||
FAULT_TRUNCATE 0x000000400
|
||||
FAULT_READ_IO 0x000000800
|
||||
FAULT_CHECKPOINT 0x000001000
|
||||
FAULT_DISCARD 0x000002000
|
||||
FAULT_WRITE_IO 0x000004000
|
||||
mode=%s Control block allocation mode which supports "adaptive"
|
||||
and "lfs". In "lfs" mode, there should be no random
|
||||
writes towards main area.
|
||||
io_bits=%u Set the bit size of write IO requests. It should be set
|
||||
with "mode=lfs".
|
||||
usrquota Enable plain user disk quota accounting.
|
||||
grpquota Enable plain group disk quota accounting.
|
||||
prjquota Enable plain project quota accounting.
|
||||
usrjquota=<file> Appoint specified file and type during mount, so that quota
|
||||
grpjquota=<file> information can be properly updated during recovery flow,
|
||||
prjjquota=<file> <quota file>: must be in root directory;
|
||||
jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
|
||||
offusrjquota Turn off user journelled quota.
|
||||
offgrpjquota Turn off group journelled quota.
|
||||
offprjjquota Turn off project journelled quota.
|
||||
quota Enable plain user disk quota accounting.
|
||||
noquota Disable all plain disk quota option.
|
||||
whint_mode=%s Control which write hints are passed down to block
|
||||
layer. This supports "off", "user-based", and
|
||||
"fs-based". In "off" mode (default), f2fs does not pass
|
||||
down hints. In "user-based" mode, f2fs tries to pass
|
||||
down hints given by users. And in "fs-based" mode, f2fs
|
||||
passes down hints with its policy.
|
||||
alloc_mode=%s Adjust block allocation policy, which supports "reuse"
|
||||
and "default".
|
||||
fsync_mode=%s Control the policy of fsync. Currently supports "posix",
|
||||
"strict", and "nobarrier". In "posix" mode, which is
|
||||
default, fsync will follow POSIX semantics and does a
|
||||
light operation to improve the filesystem performance.
|
||||
In "strict" mode, fsync will be heavy and behaves in line
|
||||
with xfs, ext4 and btrfs, where xfstest generic/342 will
|
||||
pass, but the performance will regress. "nobarrier" is
|
||||
based on "posix", but doesn't issue flush command for
|
||||
non-atomic files likewise "nobarrier" mount option.
|
||||
test_dummy_encryption
|
||||
test_dummy_encryption=%s
|
||||
Enable dummy encryption, which provides a fake fscrypt
|
||||
context. The fake fscrypt context is used by xfstests.
|
||||
The argument may be either "v1" or "v2", in order to
|
||||
select the corresponding fscrypt policy version.
|
||||
checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable"
|
||||
to reenable checkpointing. Is enabled by default. While
|
||||
disabled, any unmounting or unexpected shutdowns will cause
|
||||
the filesystem contents to appear as they did when the
|
||||
filesystem was mounted with that option.
|
||||
While mounting with checkpoint=disabled, the filesystem must
|
||||
run garbage collection to ensure that all available space can
|
||||
be used. If this takes too much time, the mount may return
|
||||
EAGAIN. You may optionally add a value to indicate how much
|
||||
of the disk you would be willing to temporarily give up to
|
||||
avoid additional garbage collection. This can be given as a
|
||||
number of blocks, or as a percent. For instance, mounting
|
||||
with checkpoint=disable:100% would always succeed, but it may
|
||||
hide up to all remaining free space. The actual space that
|
||||
would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
|
||||
This space is reclaimed once checkpoint=enable.
|
||||
compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
|
||||
"lz4" and "zstd" algorithm.
|
||||
compress_log_size=%u Support configuring compress cluster size, the size will
|
||||
be 4KB * (1 << %u), 16KB is minimum size, also it's
|
||||
default size.
|
||||
compress_extension=%s Support adding specified extension, so that f2fs can enable
|
||||
compression on those corresponding files, e.g. if all files
|
||||
with '.ext' has high compression rate, we can set the '.ext'
|
||||
on compression extension list and enable compression on
|
||||
these file by default rather than to enable it via ioctl.
|
||||
For other files, we can still enable compression via ioctl.
|
||||
|
||||
================================================================================
|
||||
DEBUGFS ENTRIES
|
||||
================================================================================
|
||||
|
||||
/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as
|
||||
f2fs. Each file shows the whole f2fs information.
|
||||
|
||||
/sys/kernel/debug/f2fs/status includes:
|
||||
- major file system information managed by f2fs currently
|
||||
- average SIT information about whole segments
|
||||
- current memory footprint consumed by f2fs.
|
||||
|
||||
================================================================================
|
||||
SYSFS ENTRIES
|
||||
================================================================================
|
||||
|
||||
Information about mounted f2fs file systems can be found in
|
||||
/sys/fs/f2fs. Each mounted filesystem will have a directory in
|
||||
/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda).
|
||||
The files in each per-device directory are shown in table below.
|
||||
|
||||
Files in /sys/fs/f2fs/<devname>
|
||||
(see also Documentation/ABI/testing/sysfs-fs-f2fs)
|
||||
|
||||
================================================================================
|
||||
USAGE
|
||||
================================================================================
|
||||
|
||||
1. Download userland tools and compile them.
|
||||
|
||||
2. Skip, if f2fs was compiled statically inside kernel.
|
||||
Otherwise, insert the f2fs.ko module.
|
||||
# insmod f2fs.ko
|
||||
|
||||
3. Create a directory trying to mount
|
||||
# mkdir /mnt/f2fs
|
||||
|
||||
4. Format the block device, and then mount as f2fs
|
||||
# mkfs.f2fs -l label /dev/block_device
|
||||
# mount -t f2fs /dev/block_device /mnt/f2fs
|
||||
|
||||
mkfs.f2fs
|
||||
---------
|
||||
The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem,
|
||||
which builds a basic on-disk layout.
|
||||
|
||||
The options consist of:
|
||||
-l [label] : Give a volume label, up to 512 unicode name.
|
||||
-a [0 or 1] : Split start location of each area for heap-based allocation.
|
||||
1 is set by default, which performs this.
|
||||
-o [int] : Set overprovision ratio in percent over volume size.
|
||||
5 is set by default.
|
||||
-s [int] : Set the number of segments per section.
|
||||
1 is set by default.
|
||||
-z [int] : Set the number of sections per zone.
|
||||
1 is set by default.
|
||||
-e [str] : Set basic extension list. e.g. "mp3,gif,mov"
|
||||
-t [0 or 1] : Disable discard command or not.
|
||||
1 is set by default, which conducts discard.
|
||||
|
||||
fsck.f2fs
|
||||
---------
|
||||
The fsck.f2fs is a tool to check the consistency of an f2fs-formatted
|
||||
partition, which examines whether the filesystem metadata and user-made data
|
||||
are cross-referenced correctly or not.
|
||||
Note that, initial version of the tool does not fix any inconsistency.
|
||||
|
||||
The options consist of:
|
||||
-d debug level [default:0]
|
||||
|
||||
dump.f2fs
|
||||
---------
|
||||
The dump.f2fs shows the information of specific inode and dumps SSA and SIT to
|
||||
file. Each file is dump_ssa and dump_sit.
|
||||
|
||||
The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
|
||||
It shows on-disk inode information recognized by a given inode number, and is
|
||||
able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
|
||||
./dump_sit respectively.
|
||||
|
||||
The options consist of:
|
||||
-d debug level [default:0]
|
||||
-i inode no (hex)
|
||||
-s [SIT dump segno from #1~#2 (decimal), for all 0~-1]
|
||||
-a [SSA dump segno from #1~#2 (decimal), for all 0~-1]
|
||||
|
||||
Examples:
|
||||
# dump.f2fs -i [ino] /dev/sdx
|
||||
# dump.f2fs -s 0~-1 /dev/sdx (SIT dump)
|
||||
# dump.f2fs -a 0~-1 /dev/sdx (SSA dump)
|
||||
|
||||
================================================================================
|
||||
DESIGN
|
||||
================================================================================
|
||||
|
||||
On-disk Layout
|
||||
--------------
|
||||
|
||||
F2FS divides the whole volume into a number of segments, each of which is fixed
|
||||
to 2MB in size. A section is composed of consecutive segments, and a zone
|
||||
consists of a set of sections. By default, section and zone sizes are set to one
|
||||
segment size identically, but users can easily modify the sizes by mkfs.
|
||||
|
||||
F2FS splits the entire volume into six areas, and all the areas except superblock
|
||||
consists of multiple segments as described below.
|
||||
|
||||
align with the zone size <-|
|
||||
|-> align with the segment size
|
||||
_________________________________________________________________________
|
||||
| | | Segment | Node | Segment | |
|
||||
| Superblock | Checkpoint | Info. | Address | Summary | Main |
|
||||
| (SB) | (CP) | Table (SIT) | Table (NAT) | Area (SSA) | |
|
||||
|____________|_____2______|______N______|______N______|______N_____|__N___|
|
||||
. .
|
||||
. .
|
||||
. .
|
||||
._________________________________________.
|
||||
|_Segment_|_..._|_Segment_|_..._|_Segment_|
|
||||
. .
|
||||
._________._________
|
||||
|_section_|__...__|_
|
||||
. .
|
||||
.________.
|
||||
|__zone__|
|
||||
|
||||
- Superblock (SB)
|
||||
: It is located at the beginning of the partition, and there exist two copies
|
||||
to avoid file system crash. It contains basic partition information and some
|
||||
default parameters of f2fs.
|
||||
|
||||
- Checkpoint (CP)
|
||||
: It contains file system information, bitmaps for valid NAT/SIT sets, orphan
|
||||
inode lists, and summary entries of current active segments.
|
||||
|
||||
- Segment Information Table (SIT)
|
||||
: It contains segment information such as valid block count and bitmap for the
|
||||
validity of all the blocks.
|
||||
|
||||
- Node Address Table (NAT)
|
||||
: It is composed of a block address table for all the node blocks stored in
|
||||
Main area.
|
||||
|
||||
- Segment Summary Area (SSA)
|
||||
: It contains summary entries which contains the owner information of all the
|
||||
data and node blocks stored in Main area.
|
||||
|
||||
- Main Area
|
||||
: It contains file and directory data including their indices.
|
||||
|
||||
In order to avoid misalignment between file system and flash-based storage, F2FS
|
||||
aligns the start block address of CP with the segment size. Also, it aligns the
|
||||
start block address of Main area with the zone size by reserving some segments
|
||||
in SSA area.
|
||||
|
||||
Reference the following survey for additional technical details.
|
||||
https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey
|
||||
|
||||
File System Metadata Structure
|
||||
------------------------------
|
||||
|
||||
F2FS adopts the checkpointing scheme to maintain file system consistency. At
|
||||
mount time, F2FS first tries to find the last valid checkpoint data by scanning
|
||||
CP area. In order to reduce the scanning time, F2FS uses only two copies of CP.
|
||||
One of them always indicates the last valid data, which is called as shadow copy
|
||||
mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism.
|
||||
|
||||
For file system consistency, each CP points to which NAT and SIT copies are
|
||||
valid, as shown as below.
|
||||
|
||||
+--------+----------+---------+
|
||||
| CP | SIT | NAT |
|
||||
+--------+----------+---------+
|
||||
. . . .
|
||||
. . . .
|
||||
. . . .
|
||||
+-------+-------+--------+--------+--------+--------+
|
||||
| CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 |
|
||||
+-------+-------+--------+--------+--------+--------+
|
||||
| ^ ^
|
||||
| | |
|
||||
`----------------------------------------'
|
||||
|
||||
Index Structure
|
||||
---------------
|
||||
|
||||
The key data structure to manage the data locations is a "node". Similar to
|
||||
traditional file structures, F2FS has three types of node: inode, direct node,
|
||||
indirect node. F2FS assigns 4KB to an inode block which contains 923 data block
|
||||
indices, two direct node pointers, two indirect node pointers, and one double
|
||||
indirect node pointer as described below. One direct node block contains 1018
|
||||
data blocks, and one indirect node block contains also 1018 node blocks. Thus,
|
||||
one inode block (i.e., a file) covers:
|
||||
|
||||
4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB.
|
||||
|
||||
Inode block (4KB)
|
||||
|- data (923)
|
||||
|- direct node (2)
|
||||
| `- data (1018)
|
||||
|- indirect node (2)
|
||||
| `- direct node (1018)
|
||||
| `- data (1018)
|
||||
`- double indirect node (1)
|
||||
`- indirect node (1018)
|
||||
`- direct node (1018)
|
||||
`- data (1018)
|
||||
|
||||
Note that, all the node blocks are mapped by NAT which means the location of
|
||||
each node is translated by the NAT table. In the consideration of the wandering
|
||||
tree problem, F2FS is able to cut off the propagation of node updates caused by
|
||||
leaf data writes.
|
||||
|
||||
Directory Structure
|
||||
-------------------
|
||||
|
||||
A directory entry occupies 11 bytes, which consists of the following attributes.
|
||||
|
||||
- hash hash value of the file name
|
||||
- ino inode number
|
||||
- len the length of file name
|
||||
- type file type such as directory, symlink, etc
|
||||
|
||||
A dentry block consists of 214 dentry slots and file names. Therein a bitmap is
|
||||
used to represent whether each dentry is valid or not. A dentry block occupies
|
||||
4KB with the following composition.
|
||||
|
||||
Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) +
|
||||
dentries(11 * 214 bytes) + file name (8 * 214 bytes)
|
||||
|
||||
[Bucket]
|
||||
+--------------------------------+
|
||||
|dentry block 1 | dentry block 2 |
|
||||
+--------------------------------+
|
||||
. .
|
||||
. .
|
||||
. [Dentry Block Structure: 4KB] .
|
||||
+--------+----------+----------+------------+
|
||||
| bitmap | reserved | dentries | file names |
|
||||
+--------+----------+----------+------------+
|
||||
[Dentry Block: 4KB] . .
|
||||
. .
|
||||
. .
|
||||
+------+------+-----+------+
|
||||
| hash | ino | len | type |
|
||||
+------+------+-----+------+
|
||||
[Dentry Structure: 11 bytes]
|
||||
|
||||
F2FS implements multi-level hash tables for directory structure. Each level has
|
||||
a hash table with dedicated number of hash buckets as shown below. Note that
|
||||
"A(2B)" means a bucket includes 2 data blocks.
|
||||
|
||||
----------------------
|
||||
A : bucket
|
||||
B : block
|
||||
N : MAX_DIR_HASH_DEPTH
|
||||
----------------------
|
||||
|
||||
level #0 | A(2B)
|
||||
|
|
||||
level #1 | A(2B) - A(2B)
|
||||
|
|
||||
level #2 | A(2B) - A(2B) - A(2B) - A(2B)
|
||||
. | . . . .
|
||||
level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B)
|
||||
. | . . . .
|
||||
level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B)
|
||||
|
||||
The number of blocks and buckets are determined by,
|
||||
|
||||
,- 2, if n < MAX_DIR_HASH_DEPTH / 2,
|
||||
# of blocks in level #n = |
|
||||
`- 4, Otherwise
|
||||
|
||||
,- 2^(n + dir_level),
|
||||
| if n + dir_level < MAX_DIR_HASH_DEPTH / 2,
|
||||
# of buckets in level #n = |
|
||||
`- 2^((MAX_DIR_HASH_DEPTH / 2) - 1),
|
||||
Otherwise
|
||||
|
||||
When F2FS finds a file name in a directory, at first a hash value of the file
|
||||
name is calculated. Then, F2FS scans the hash table in level #0 to find the
|
||||
dentry consisting of the file name and its inode number. If not found, F2FS
|
||||
scans the next hash table in level #1. In this way, F2FS scans hash tables in
|
||||
each levels incrementally from 1 to N. In each levels F2FS needs to scan only
|
||||
one bucket determined by the following equation, which shows O(log(# of files))
|
||||
complexity.
|
||||
|
||||
bucket number to scan in level #n = (hash value) % (# of buckets in level #n)
|
||||
|
||||
In the case of file creation, F2FS finds empty consecutive slots that cover the
|
||||
file name. F2FS searches the empty slots in the hash tables of whole levels from
|
||||
1 to N in the same way as the lookup operation.
|
||||
|
||||
The following figure shows an example of two cases holding children.
|
||||
--------------> Dir <--------------
|
||||
| |
|
||||
child child
|
||||
|
||||
child - child [hole] - child
|
||||
|
||||
child - child - child [hole] - [hole] - child
|
||||
|
||||
Case 1: Case 2:
|
||||
Number of children = 6, Number of children = 3,
|
||||
File size = 7 File size = 7
|
||||
|
||||
Default Block Allocation
|
||||
------------------------
|
||||
|
||||
At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node
|
||||
and Hot/Warm/Cold data.
|
||||
|
||||
- Hot node contains direct node blocks of directories.
|
||||
- Warm node contains direct node blocks except hot node blocks.
|
||||
- Cold node contains indirect node blocks
|
||||
- Hot data contains dentry blocks
|
||||
- Warm data contains data blocks except hot and cold data blocks
|
||||
- Cold data contains multimedia data or migrated data blocks
|
||||
|
||||
LFS has two schemes for free space management: threaded log and copy-and-compac-
|
||||
tion. The copy-and-compaction scheme which is known as cleaning, is well-suited
|
||||
for devices showing very good sequential write performance, since free segments
|
||||
are served all the time for writing new data. However, it suffers from cleaning
|
||||
overhead under high utilization. Contrarily, the threaded log scheme suffers
|
||||
from random writes, but no cleaning process is needed. F2FS adopts a hybrid
|
||||
scheme where the copy-and-compaction scheme is adopted by default, but the
|
||||
policy is dynamically changed to the threaded log scheme according to the file
|
||||
system status.
|
||||
|
||||
In order to align F2FS with underlying flash-based storage, F2FS allocates a
|
||||
segment in a unit of section. F2FS expects that the section size would be the
|
||||
same as the unit size of garbage collection in FTL. Furthermore, with respect
|
||||
to the mapping granularity in FTL, F2FS allocates each section of the active
|
||||
logs from different zones as much as possible, since FTL can write the data in
|
||||
the active logs into one allocation unit according to its mapping granularity.
|
||||
|
||||
Cleaning process
|
||||
----------------
|
||||
|
||||
F2FS does cleaning both on demand and in the background. On-demand cleaning is
|
||||
triggered when there are not enough free segments to serve VFS calls. Background
|
||||
cleaner is operated by a kernel thread, and triggers the cleaning job when the
|
||||
system is idle.
|
||||
|
||||
F2FS supports two victim selection policies: greedy and cost-benefit algorithms.
|
||||
In the greedy algorithm, F2FS selects a victim segment having the smallest number
|
||||
of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment
|
||||
according to the segment age and the number of valid blocks in order to address
|
||||
log block thrashing problem in the greedy algorithm. F2FS adopts the greedy
|
||||
algorithm for on-demand cleaner, while background cleaner adopts cost-benefit
|
||||
algorithm.
|
||||
|
||||
In order to identify whether the data in the victim segment are valid or not,
|
||||
F2FS manages a bitmap. Each bit represents the validity of a block, and the
|
||||
bitmap is composed of a bit stream covering whole blocks in main area.
|
||||
|
||||
Fallocate(2) Policy
|
||||
-------------------
|
||||
|
||||
The default policy follows the below posix rule.
|
||||
|
||||
Allocating disk space
|
||||
The default operation (i.e., mode is zero) of fallocate() allocates
|
||||
the disk space within the range specified by offset and len. The
|
||||
file size (as reported by stat(2)) will be changed if offset+len is
|
||||
greater than the file size. Any subregion within the range specified
|
||||
by offset and len that did not contain data before the call will be
|
||||
initialized to zero. This default behavior closely resembles the
|
||||
behavior of the posix_fallocate(3) library function, and is intended
|
||||
as a method of optimally implementing that function.
|
||||
|
||||
However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to
|
||||
fallocate(fd, DEFAULT_MODE), it allocates on-disk blocks addressess having
|
||||
zero or random data, which is useful to the below scenario where:
|
||||
1. create(fd)
|
||||
2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
|
||||
3. fallocate(fd, 0, 0, size)
|
||||
4. address = fibmap(fd, offset)
|
||||
5. open(blkdev)
|
||||
6. write(blkdev, address)
|
||||
|
||||
Compression implementation
|
||||
--------------------------
|
||||
|
||||
- New term named cluster is defined as basic unit of compression, file can
|
||||
be divided into multiple clusters logically. One cluster includes 4 << n
|
||||
(n >= 0) logical pages, compression size is also cluster size, each of
|
||||
cluster can be compressed or not.
|
||||
|
||||
- In cluster metadata layout, one special block address is used to indicate
|
||||
cluster is compressed one or normal one, for compressed cluster, following
|
||||
metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs
|
||||
stores data including compress header and compressed data.
|
||||
|
||||
- In order to eliminate write amplification during overwrite, F2FS only
|
||||
support compression on write-once file, data can be compressed only when
|
||||
all logical blocks in file are valid and cluster compress ratio is lower
|
||||
than specified threshold.
|
||||
|
||||
- To enable compression on regular inode, there are three ways:
|
||||
* chattr +c file
|
||||
* chattr +c dir; touch dir/file
|
||||
* mount w/ -o compress_extension=ext; touch file.ext
|
||||
|
||||
Compress metadata layout::

                                [Dnode Structure]
                +-----------------------------------------------+
                | cluster 1 | cluster 2 | ......... | cluster N |
                +-----------------------------------------------+
                .           .                       .           .
          .                       .                .                      .
     .         Compressed Cluster        .        .         Normal Cluster          .
     +----------+---------+---------+---------+   +---------+---------+---------+---------+
     |compr flag| block 1 | block 2 | block 3 |   | block 1 | block 2 | block 3 | block 4 |
     +----------+---------+---------+---------+   +---------+---------+---------+---------+
                .                             .
              .                                            .
            .                                                            .
           +-------------+-------------+----------+----------------------------+
           | data length | data chksum | reserved |      compressed data       |
           +-------------+-------------+----------+----------------------------+
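
As a small worked example of the cluster geometry above (the helpers here are
illustrative assumptions, not f2fs functions): with clusters of 4 << n pages,
mapping a logical page index to its cluster and intra-cluster offset is a
shift and a mask::

    #include <stdio.h>

    /* Illustrative helpers: a cluster holds 4 << n logical pages (n >= 0). */
    #define LOG_PAGES_PER_CLUSTER(n)        (2 + (n))
    #define PAGES_PER_CLUSTER(n)            (1u << LOG_PAGES_PER_CLUSTER(n))

    static unsigned int cluster_index(unsigned int page_idx, unsigned int n)
    {
            return page_idx >> LOG_PAGES_PER_CLUSTER(n);
    }

    static unsigned int offset_in_cluster(unsigned int page_idx, unsigned int n)
    {
            return page_idx & (PAGES_PER_CLUSTER(n) - 1);
    }

    int main(void)
    {
            unsigned int n = 0;             /* n == 0: 4-page clusters */
            unsigned int page_idx = 10;

            printf("page %u -> cluster %u, offset %u (cluster size: %u pages)\n",
                   page_idx, cluster_index(page_idx, n),
                   offset_in_cluster(page_idx, n), PAGES_PER_CLUSTER(n));
            return 0;
    }
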
@@ -27,9 +27,9 @@ automatically verified against the file's Merkle tree. Reads of any
corrupted data, including mmap reads, will fail.

Userspace can use another ioctl to retrieve the root hash (actually
the "file measurement", which is a hash that includes the root hash)
that fs-verity is enforcing for the file. This ioctl executes in
constant time, regardless of the file size.
the "fs-verity file digest", which is a hash that includes the Merkle
tree root hash) that fs-verity is enforcing for the file. This ioctl
executes in constant time, regardless of the file size.

fs-verity is essentially a way to hash a file in constant time,
subject to the caveat that reads which would violate the hash will
@@ -177,9 +177,10 @@ FS_IOC_ENABLE_VERITY can fail with the following errors:
FS_IOC_MEASURE_VERITY
---------------------

The FS_IOC_MEASURE_VERITY ioctl retrieves the measurement of a verity
file. The file measurement is a digest that cryptographically
identifies the file contents that are being enforced on reads.
The FS_IOC_MEASURE_VERITY ioctl retrieves the digest of a verity file.
The fs-verity file digest is a cryptographic digest that identifies
the file contents that are being enforced on reads; it is computed via
a Merkle tree and is different from a traditional full-file digest.

This ioctl takes in a pointer to a variable-length structure::

@@ -197,7 +198,7 @@ On success, 0 is returned and the kernel fills in the structure as
follows:

- ``digest_algorithm`` will be the hash algorithm used for the file
  measurement. It will match ``fsverity_enable_arg::hash_algorithm``.
  digest. It will match ``fsverity_enable_arg::hash_algorithm``.
- ``digest_size`` will be the size of the digest in bytes, e.g. 32
  for SHA-256. (This can be redundant with ``digest_algorithm``.)
- ``digest`` will be the actual bytes of the digest.
@@ -216,6 +217,82 @@ FS_IOC_MEASURE_VERITY can fail with the following errors:
- ``EOVERFLOW``: the digest is longer than the specified
  ``digest_size`` bytes. Try providing a larger buffer.
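
For illustration only (not part of the patch being merged), a minimal
user-space caller of this ioctl might look as follows; it assumes the
``struct fsverity_digest`` layout from ``<linux/fsverity.h>``, where the
caller sets ``digest_size`` to the buffer capacity and the kernel fills in
``digest_algorithm``, ``digest_size`` and the flexible ``digest[]`` array::

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/fsverity.h>     /* FS_IOC_MEASURE_VERITY, struct fsverity_digest */

    int main(int argc, char **argv)
    {
            struct fsverity_digest *d;
            int fd, i;

            if (argc != 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0)
                    return 1;

            /* Room for the two-field header plus up to a 64-byte digest. */
            d = calloc(1, sizeof(*d) + 64);
            if (!d)
                    return 1;
            d->digest_size = 64;            /* capacity in, actual size out */

            if (ioctl(fd, FS_IOC_MEASURE_VERITY, d)) {
                    perror("FS_IOC_MEASURE_VERITY");
                    return 1;
            }
            printf("algorithm %u, %u-byte digest: ",
                   d->digest_algorithm, d->digest_size);
            for (i = 0; i < d->digest_size; i++)
                    printf("%02x", d->digest[i]);
            printf("\n");
            return 0;
    }
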
FS_IOC_READ_VERITY_METADATA
---------------------------

The FS_IOC_READ_VERITY_METADATA ioctl reads verity metadata from a
verity file. This ioctl is available since Linux v5.12.

This ioctl allows writing a server program that takes a verity file
and serves it to a client program, such that the client can do its own
fs-verity compatible verification of the file. This only makes sense
if the client doesn't trust the server and if the server needs to
provide the storage for the client.

This is a fairly specialized use case, and most fs-verity users won't
need this ioctl.

This ioctl takes in a pointer to the following structure::

    #define FS_VERITY_METADATA_TYPE_MERKLE_TREE     1
    #define FS_VERITY_METADATA_TYPE_DESCRIPTOR      2
    #define FS_VERITY_METADATA_TYPE_SIGNATURE       3

    struct fsverity_read_metadata_arg {
            __u64 metadata_type;
            __u64 offset;
            __u64 length;
            __u64 buf_ptr;
            __u64 __reserved;
    };

``metadata_type`` specifies the type of metadata to read:

- ``FS_VERITY_METADATA_TYPE_MERKLE_TREE`` reads the blocks of the
  Merkle tree. The blocks are returned in order from the root level
  to the leaf level. Within each level, the blocks are returned in
  the same order that their hashes are themselves hashed.
  See `Merkle tree`_ for more information.

- ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity
  descriptor. See `fs-verity descriptor`_.

- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the signature which was
  passed to FS_IOC_ENABLE_VERITY, if any. See `Built-in signature
  verification`_.

The semantics are similar to those of ``pread()``. ``offset``
specifies the offset in bytes into the metadata item to read from, and
``length`` specifies the maximum number of bytes to read from the
metadata item. ``buf_ptr`` is the pointer to the buffer to read into,
cast to a 64-bit integer. ``__reserved`` must be 0. On success, the
number of bytes read is returned. 0 is returned at the end of the
metadata item. The returned length may be less than ``length``, for
example if the ioctl is interrupted.

The metadata returned by FS_IOC_READ_VERITY_METADATA isn't guaranteed
to be authenticated against the file digest that would be returned by
`FS_IOC_MEASURE_VERITY`_, as the metadata is expected to be used to
implement fs-verity compatible verification anyway (though absent a
malicious disk, the metadata will indeed match). E.g. to implement
this ioctl, the filesystem is allowed to just read the Merkle tree
blocks from disk without actually verifying the path to the root node.

FS_IOC_READ_VERITY_METADATA can fail with the following errors:

- ``EFAULT``: the caller provided inaccessible memory
- ``EINTR``: the ioctl was interrupted before any data was read
- ``EINVAL``: reserved fields were set, or ``offset + length``
  overflowed
- ``ENODATA``: the file is not a verity file, or
  FS_VERITY_METADATA_TYPE_SIGNATURE was requested but the file doesn't
  have a built-in signature
- ``ENOTTY``: this type of filesystem does not implement fs-verity, or
  this ioctl is not yet implemented on it
- ``EOPNOTSUPP``: the kernel was not configured with fs-verity
  support, or the filesystem superblock has not had the 'verity'
  feature enabled on it. (See `Filesystem support`_.)
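
As an illustrative sketch only (again, not part of the merged patch), a
client-side dump of the Merkle tree using the ``pread()``-like semantics
described above could be written roughly as follows, assuming a uapi header
that provides the ioctl and structure (v5.12+)::

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fsverity.h>     /* assumed to provide FS_IOC_READ_VERITY_METADATA */

    int main(int argc, char **argv)
    {
            struct fsverity_read_metadata_arg arg;
            char buf[4096];
            long n;
            int fd;

            if (argc != 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0)
                    return 1;

            memset(&arg, 0, sizeof(arg));
            arg.metadata_type = FS_VERITY_METADATA_TYPE_MERKLE_TREE;
            arg.buf_ptr = (uintptr_t)buf;
            arg.length = sizeof(buf);

            /* pread()-style loop: bump offset until 0 marks the end of the item. */
            while ((n = ioctl(fd, FS_IOC_READ_VERITY_METADATA, &arg)) > 0) {
                    fwrite(buf, 1, n, stdout);
                    arg.offset += n;
            }
            return n < 0 ? 1 : 0;
    }
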
FS_IOC_GETFLAGS
---------------

@@ -257,25 +334,24 @@ non-verity one, with the following exceptions:
  with EIO (for read()) or SIGBUS (for mmap() reads).

- If the sysctl "fs.verity.require_signatures" is set to 1 and the
  file's verity measurement is not signed by a key in the fs-verity
  keyring, then opening the file will fail. See `Built-in signature
  verification`_.
  file is not signed by a key in the fs-verity keyring, then opening
  the file will fail. See `Built-in signature verification`_.

Direct access to the Merkle tree is not supported. Therefore, if a
verity file is copied, or is backed up and restored, then it will lose
its "verity"-ness. fs-verity is primarily meant for files like
executables that are managed by a package manager.

File measurement computation
============================
File digest computation
=======================

This section describes how fs-verity hashes the file contents using a
Merkle tree to produce the "file measurement" which cryptographically
identifies the file contents. This algorithm is the same for all
filesystems that support fs-verity.
Merkle tree to produce the digest which cryptographically identifies
the file contents. This algorithm is the same for all filesystems
that support fs-verity.

Userspace only needs to be aware of this algorithm if it needs to
compute the file measurement itself, e.g. in order to sign the file.
compute fs-verity file digests itself, e.g. in order to sign files.

.. _fsverity_merkle_tree:

@@ -325,26 +401,22 @@ can't a distinguish a large file from a small second file whose data
is exactly the top-level hash block of the first file. Ambiguities
also arise from the convention of padding to the next block boundary.

To solve this problem, the verity file measurement is actually
computed as a hash of the following structure, which contains the
Merkle tree root hash as well as other fields such as the file size::
To solve this problem, the fs-verity file digest is actually computed
as a hash of the following structure, which contains the Merkle tree
root hash as well as other fields such as the file size::

    struct fsverity_descriptor {
            __u8 version;           /* must be 1 */
            __u8 hash_algorithm;    /* Merkle tree hash algorithm */
            __u8 log_blocksize;     /* log2 of size of data and tree blocks */
            __u8 salt_size;         /* size of salt in bytes; 0 if none */
            __le32 sig_size;        /* must be 0 */
            __le32 __reserved_0x04; /* must be 0 */
            __le64 data_size;       /* size of file the Merkle tree is built over */
            __u8 root_hash[64];     /* Merkle tree root hash */
            __u8 salt[32];          /* salt prepended to each hashed block */
            __u8 __reserved[144];   /* must be 0's */
    };

Note that the ``sig_size`` field must be set to 0 for the purpose of
computing the file measurement, even if a signature was provided (or
will be provided) to `FS_IOC_ENABLE_VERITY`_.

Built-in signature verification
===============================

@@ -359,20 +431,20 @@ kernel. Specifically, it adds support for:
   certificates from being added.

2. `FS_IOC_ENABLE_VERITY`_ accepts a pointer to a PKCS#7 formatted
   detached signature in DER format of the file measurement. On
   success, this signature is persisted alongside the Merkle tree.
   detached signature in DER format of the file's fs-verity digest.
   On success, this signature is persisted alongside the Merkle tree.
   Then, any time the file is opened, the kernel will verify the
   file's actual measurement against this signature, using the
   certificates in the ".fs-verity" keyring.
   file's actual digest against this signature, using the certificates
   in the ".fs-verity" keyring.

3. A new sysctl "fs.verity.require_signatures" is made available.
   When set to 1, the kernel requires that all verity files have a
   correctly signed file measurement as described in (2).
   correctly signed digest as described in (2).

File measurements must be signed in the following format, which is
similar to the structure used by `FS_IOC_MEASURE_VERITY`_::
fs-verity file digests must be signed in the following format, which
is similar to the structure used by `FS_IOC_MEASURE_VERITY`_::

    struct fsverity_signed_digest {
    struct fsverity_formatted_digest {
            char magic[8];          /* must be "FSVerity" */
            __le16 digest_algorithm;
            __le16 digest_size;
@@ -421,8 +493,8 @@ can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be cleared.

ext4 also supports encryption, which can be used simultaneously with
fs-verity. In this case, the plaintext data is verified rather than
the ciphertext. This is necessary in order to make the file
measurement meaningful, since every file is encrypted differently.
the ciphertext. This is necessary in order to make the fs-verity file
digest meaningful, since every file is encrypted differently.

ext4 stores the verity metadata (Merkle tree and fsverity_descriptor)
past the end of the file, starting at the first 64K boundary beyond
@@ -592,8 +664,8 @@ weren't already directly answered in other parts of this document.
:Q: Isn't fs-verity useless because the attacker can just modify the
    hashes in the Merkle tree, which is stored on-disk?
:A: To verify the authenticity of an fs-verity file you must verify
    the authenticity of the "file measurement", which is basically the
    root hash of the Merkle tree. See `Use cases`_.
    the authenticity of the "fs-verity file digest", which
    incorporates the root hash of the Merkle tree. See `Use cases`_.

:Q: Isn't fs-verity useless because the attacker can just replace a
    verity file with a non-verity one?

@@ -5277,6 +5277,7 @@ F:	Documentation/ABI/testing/sysfs-fs-f2fs
F:	fs/f2fs/
F:	include/linux/f2fs_fs.h
F:	include/trace/events/f2fs.h
F:	include/uapi/linux/f2fs.h

F71805F HARDWARE MONITORING DRIVER
M:	Jean Delvare <jdelvare@suse.com>

@@ -517,6 +517,7 @@ static int dax_open(struct inode *inode, struct file *filp)
	inode->i_mapping->host = __dax_inode;
	filp->f_mapping = inode->i_mapping;
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	filp->f_sb_err = file_sample_sb_err(filp);
	filp->private_data = dev_dax;
	inode->i_flags = S_DAX;

@@ -1142,12 +1142,19 @@ EXPORT_SYMBOL(mark_buffer_dirty);

void mark_buffer_write_io_error(struct buffer_head *bh)
{
	struct super_block *sb;

	set_buffer_write_io_error(bh);
	/* FIXME: do we need to set this in both places? */
	if (bh->b_page && bh->b_page->mapping)
		mapping_set_error(bh->b_page->mapping, -EIO);
	if (bh->b_assoc_map)
		mapping_set_error(bh->b_assoc_map, -EIO);
	rcu_read_lock();
	sb = READ_ONCE(bh->b_bdev->bd_super);
	if (sb)
		errseq_set(&sb->s_wb_err, -EIO);
	rcu_read_unlock();
}
EXPORT_SYMBOL(mark_buffer_write_io_error);

@ -415,9 +415,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
|
||||
* directory's encryption key, then @iname is the plaintext, so we encrypt it to
|
||||
* get the disk_name.
|
||||
*
|
||||
* Else, for keyless @lookup operations, @iname is the presented ciphertext, so
|
||||
* we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be
|
||||
* impossible in this case, so we fail them with ENOKEY.
|
||||
* Else, for keyless @lookup operations, @iname should be a no-key name, so we
|
||||
* decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will
|
||||
* be impossible in this case, so we fail them with ENOKEY.
|
||||
*
|
||||
* If successful, fscrypt_free_filename() must be called later to clean up.
|
||||
*
|
||||
@ -461,7 +461,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
|
||||
}
|
||||
if (!lookup)
|
||||
return -ENOKEY;
|
||||
fname->is_ciphertext_name = true;
|
||||
fname->is_nokey_name = true;
|
||||
|
||||
/*
|
||||
* We don't have the key and we are doing a lookup; decode the
|
||||
@ -571,17 +571,17 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
|
||||
|
||||
/*
|
||||
* Plaintext names are always valid, since fscrypt doesn't support
|
||||
* reverting to ciphertext names without evicting the directory's inode
|
||||
* reverting to no-key names without evicting the directory's inode
|
||||
* -- which implies eviction of the dentries in the directory.
|
||||
*/
|
||||
if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
|
||||
if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* Ciphertext name; valid if the directory's key is still unavailable.
|
||||
* No-key name; valid if the directory's key is still unavailable.
|
||||
*
|
||||
* Although fscrypt forbids rename() on ciphertext names, we still must
|
||||
* use dget_parent() here rather than use ->d_parent directly. That's
|
||||
* Although fscrypt forbids rename() on no-key names, we still must use
|
||||
* dget_parent() here rather than use ->d_parent directly. That's
|
||||
* because a corrupted fs image may contain directory hard links, which
|
||||
* the VFS handles by moving the directory's dentry tree in the dcache
|
||||
* each time ->lookup() finds the directory and it already has a dentry
|
||||
@ -602,4 +602,4 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
|
||||
|
||||
return valid;
|
||||
}
|
||||
EXPORT_SYMBOL(fscrypt_d_revalidate);
|
||||
EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
|
||||
|
@ -59,8 +59,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* ... in case we looked up ciphertext name before key was added */
|
||||
if (dentry->d_flags & DCACHE_ENCRYPTED_NAME)
|
||||
/* ... in case we looked up no-key name before key was added */
|
||||
if (dentry->d_flags & DCACHE_NOKEY_NAME)
|
||||
return -ENOKEY;
|
||||
|
||||
if (!fscrypt_has_permitted_context(dir, inode))
|
||||
@ -84,9 +84,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* ... in case we looked up ciphertext name(s) before key was added */
|
||||
if ((old_dentry->d_flags | new_dentry->d_flags) &
|
||||
DCACHE_ENCRYPTED_NAME)
|
||||
/* ... in case we looked up no-key name(s) before key was added */
|
||||
if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME)
|
||||
return -ENOKEY;
|
||||
|
||||
if (old_dir != new_dir) {
|
||||
@ -113,9 +112,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
|
||||
if (err && err != -ENOENT)
|
||||
return err;
|
||||
|
||||
if (fname->is_ciphertext_name) {
|
||||
if (fname->is_nokey_name) {
|
||||
spin_lock(&dentry->d_lock);
|
||||
dentry->d_flags |= DCACHE_ENCRYPTED_NAME;
|
||||
dentry->d_flags |= DCACHE_NOKEY_NAME;
|
||||
spin_unlock(&dentry->d_lock);
|
||||
}
|
||||
return err;
|
||||
|
@ -3157,10 +3157,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
|
||||
/* dir.c */
|
||||
extern const struct file_operations ext4_dir_operations;
|
||||
|
||||
#ifdef CONFIG_UNICODE
|
||||
extern const struct dentry_operations ext4_dentry_ops;
|
||||
#endif
|
||||
|
||||
/* file.c */
|
||||
extern const struct inode_operations ext4_file_inode_operations;
|
||||
extern const struct file_operations ext4_file_operations;
|
||||
|
@ -1178,6 +1178,12 @@ out:
|
||||
return -EOPNOTSUPP;
|
||||
return fsverity_ioctl_measure(filp, (void __user *)arg);
|
||||
|
||||
case FS_IOC_READ_VERITY_METADATA:
|
||||
if (!ext4_has_feature_verity(sb))
|
||||
return -EOPNOTSUPP;
|
||||
return fsverity_ioctl_read_metadata(filp,
|
||||
(const void __user *)arg);
|
||||
|
||||
default:
|
||||
return -ENOTTY;
|
||||
}
|
||||
@ -1256,6 +1262,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
case FS_IOC_MEASURE_VERITY:
|
||||
case EXT4_IOC_FSGETXATTR:
|
||||
case EXT4_IOC_FSSETXATTR:
|
||||
case FS_IOC_READ_VERITY_METADATA:
|
||||
break;
|
||||
default:
|
||||
return -ENOIOCTLCMD;
|
||||
|
@ -1689,7 +1689,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
|
||||
struct buffer_head *bh;
|
||||
|
||||
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
|
||||
generic_set_encrypted_ci_d_ops(dir, dentry);
|
||||
generic_set_encrypted_ci_d_ops(dentry);
|
||||
if (err == -ENOENT)
|
||||
return NULL;
|
||||
if (err)
|
||||
|
@ -6,6 +6,13 @@ config F2FS_FS
|
||||
select CRYPTO_CRC32
|
||||
select F2FS_FS_XATTR if FS_ENCRYPTION
|
||||
select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
|
||||
select LZ4_COMPRESS if F2FS_FS_LZ4
|
||||
select LZ4_DECOMPRESS if F2FS_FS_LZ4
|
||||
select LZ4HC_COMPRESS if F2FS_FS_LZ4HC
|
||||
select LZO_COMPRESS if F2FS_FS_LZO
|
||||
select LZO_DECOMPRESS if F2FS_FS_LZO
|
||||
select ZSTD_COMPRESS if F2FS_FS_ZSTD
|
||||
select ZSTD_DECOMPRESS if F2FS_FS_ZSTD
|
||||
help
|
||||
F2FS is based on Log-structured File System (LFS), which supports
|
||||
versatile "flash-friendly" features. The design has been focused on
|
||||
@ -88,16 +95,6 @@ config F2FS_FS_ENCRYPTION
|
||||
FS_ENCRYPTION. Use CONFIG_FS_ENCRYPTION=y in new config
|
||||
files.
|
||||
|
||||
config F2FS_IO_TRACE
|
||||
bool "F2FS IO tracer"
|
||||
depends on F2FS_FS
|
||||
depends on FUNCTION_TRACER
|
||||
help
|
||||
F2FS IO trace is based on a function trace, which gathers process
|
||||
information and block IO patterns in the filesystem level.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config F2FS_FAULT_INJECTION
|
||||
bool "F2FS fault injection facility"
|
||||
depends on F2FS_FS
|
||||
@ -116,26 +113,44 @@ config F2FS_FS_COMPRESSION
|
||||
config F2FS_FS_LZO
|
||||
bool "LZO compression support"
|
||||
depends on F2FS_FS_COMPRESSION
|
||||
select LZO_COMPRESS
|
||||
select LZO_DECOMPRESS
|
||||
default y
|
||||
help
|
||||
Support LZO compress algorithm, if unsure, say Y.
|
||||
|
||||
config F2FS_FS_LZORLE
|
||||
bool "LZO-RLE compression support"
|
||||
depends on F2FS_FS_LZO
|
||||
default y
|
||||
help
|
||||
Support LZO-RLE compress algorithm, if unsure, say Y.
|
||||
|
||||
config F2FS_FS_LZ4
|
||||
bool "LZ4 compression support"
|
||||
depends on F2FS_FS_COMPRESSION
|
||||
select LZ4_COMPRESS
|
||||
select LZ4_DECOMPRESS
|
||||
default y
|
||||
help
|
||||
Support LZ4 compress algorithm, if unsure, say Y.
|
||||
|
||||
config F2FS_FS_LZ4HC
|
||||
bool "LZ4HC compression support"
|
||||
depends on F2FS_FS_LZ4
|
||||
default y
|
||||
help
|
||||
Support LZ4HC compress algorithm, LZ4HC has compatible on-disk
|
||||
layout with LZ4, if unsure, say Y.
|
||||
|
||||
config F2FS_FS_ZSTD
|
||||
bool "ZSTD compression support"
|
||||
depends on F2FS_FS_COMPRESSION
|
||||
select ZSTD_COMPRESS
|
||||
select ZSTD_DECOMPRESS
|
||||
default y
|
||||
help
|
||||
Support ZSTD compress algorithm, if unsure, say Y.
|
||||
|
||||
config F2FS_IOSTAT
|
||||
bool "F2FS IO statistics information"
|
||||
depends on F2FS_FS
|
||||
default y
|
||||
help
|
||||
Support getting IO statistics through sysfs and printing out periodic
|
||||
IO statistics tracepoint events. You have to turn on "iostat_enable"
|
||||
sysfs node to enable this feature.
|
||||
|
@@ -7,6 +7,6 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
f2fs-$(CONFIG_FS_VERITY) += verity.o
f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o
f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o

@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count)
|
||||
static inline int f2fs_acl_count(size_t size)
|
||||
{
|
||||
ssize_t s;
|
||||
|
||||
size -= sizeof(struct f2fs_acl_header);
|
||||
s = size - 4 * sizeof(struct f2fs_acl_entry_short);
|
||||
if (s < 0) {
|
||||
@ -160,7 +161,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
|
||||
return (void *)f2fs_acl;
|
||||
|
||||
fail:
|
||||
kvfree(f2fs_acl);
|
||||
kfree(f2fs_acl);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
@ -190,7 +191,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
|
||||
acl = NULL;
|
||||
else
|
||||
acl = ERR_PTR(retval);
|
||||
kvfree(value);
|
||||
kfree(value);
|
||||
|
||||
return acl;
|
||||
}
|
||||
@ -200,6 +201,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
|
||||
return __f2fs_get_acl(inode, type, NULL);
|
||||
}
|
||||
|
||||
static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
|
||||
struct posix_acl **acl)
|
||||
{
|
||||
umode_t mode = inode->i_mode;
|
||||
int error;
|
||||
|
||||
if (is_inode_flag_set(inode, FI_ACL_MODE))
|
||||
mode = F2FS_I(inode)->i_acl_mode;
|
||||
|
||||
error = posix_acl_equiv_mode(*acl, &mode);
|
||||
if (error < 0)
|
||||
return error;
|
||||
if (error == 0)
|
||||
*acl = NULL;
|
||||
if (!in_group_p(inode->i_gid) &&
|
||||
!capable_wrt_inode_uidgid(inode, CAP_FSETID))
|
||||
mode &= ~S_ISGID;
|
||||
*mode_p = mode;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __f2fs_set_acl(struct inode *inode, int type,
|
||||
struct posix_acl *acl, struct page *ipage)
|
||||
{
|
||||
@ -213,7 +235,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
|
||||
case ACL_TYPE_ACCESS:
|
||||
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
|
||||
if (acl && !ipage) {
|
||||
error = posix_acl_update_mode(inode, &mode, &acl);
|
||||
error = f2fs_acl_update_mode(inode, &mode, &acl);
|
||||
if (error)
|
||||
return error;
|
||||
set_acl_inode(inode, mode);
|
||||
@ -240,7 +262,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
|
||||
|
||||
error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
|
||||
|
||||
kvfree(value);
|
||||
kfree(value);
|
||||
if (!error)
|
||||
set_cached_acl(inode, type, acl);
|
||||
|
||||
@ -384,7 +406,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
|
||||
struct page *dpage)
|
||||
{
|
||||
struct posix_acl *default_acl = NULL, *acl = NULL;
|
||||
int error = 0;
|
||||
int error;
|
||||
|
||||
error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage);
|
||||
if (error)
|
||||
|
@ -13,13 +13,16 @@
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/kthread.h>
|
||||
|
||||
#include "f2fs.h"
|
||||
#include "node.h"
|
||||
#include "segment.h"
|
||||
#include "trace.h"
|
||||
#include "iostat.h"
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
|
||||
|
||||
static struct kmem_cache *ino_entry_slab;
|
||||
struct kmem_cache *f2fs_inode_entry_slab;
|
||||
|
||||
@ -37,7 +40,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
|
||||
struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
|
||||
{
|
||||
struct address_space *mapping = META_MAPPING(sbi);
|
||||
struct page *page = NULL;
|
||||
struct page *page;
|
||||
repeat:
|
||||
page = f2fs_grab_cache_page(mapping, index, false);
|
||||
if (!page) {
|
||||
@ -107,7 +110,7 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
|
||||
return __get_meta_page(sbi, index, true);
|
||||
}
|
||||
|
||||
struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index)
|
||||
struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index)
|
||||
{
|
||||
struct page *page;
|
||||
int count = 0;
|
||||
@ -279,18 +282,22 @@ out:
|
||||
return blkno - start;
|
||||
}
|
||||
|
||||
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
|
||||
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
|
||||
unsigned int ra_blocks)
|
||||
{
|
||||
struct page *page;
|
||||
bool readahead = false;
|
||||
|
||||
if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
|
||||
return;
|
||||
|
||||
page = find_get_page(META_MAPPING(sbi), index);
|
||||
if (!page || !PageUptodate(page))
|
||||
readahead = true;
|
||||
f2fs_put_page(page, 0);
|
||||
|
||||
if (readahead)
|
||||
f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
|
||||
f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
|
||||
}
|
||||
|
||||
static int __f2fs_write_meta_page(struct page *page,
|
||||
@ -348,13 +355,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
|
||||
goto skip_write;
|
||||
|
||||
/* if locked failed, cp will flush dirty pages instead */
|
||||
if (!mutex_trylock(&sbi->cp_mutex))
|
||||
if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
|
||||
goto skip_write;
|
||||
|
||||
trace_f2fs_writepages(mapping->host, wbc, META);
|
||||
diff = nr_pages_to_write(sbi, META, wbc);
|
||||
written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
|
||||
mutex_unlock(&sbi->cp_mutex);
|
||||
f2fs_up_write(&sbi->cp_global_sem);
|
||||
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
|
||||
return 0;
|
||||
|
||||
@ -442,8 +449,7 @@ static int f2fs_set_meta_page_dirty(struct page *page)
|
||||
if (!PageDirty(page)) {
|
||||
__set_page_dirty_nobuffers(page);
|
||||
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
|
||||
f2fs_set_page_private(page, 0);
|
||||
f2fs_trace_pid(page);
|
||||
set_page_private_reference(page);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
@ -464,16 +470,29 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
|
||||
unsigned int devidx, int type)
|
||||
{
|
||||
struct inode_management *im = &sbi->im[type];
|
||||
struct ino_entry *e, *tmp;
|
||||
struct ino_entry *e = NULL, *new = NULL;
|
||||
|
||||
tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
|
||||
if (type == FLUSH_INO) {
|
||||
rcu_read_lock();
|
||||
e = radix_tree_lookup(&im->ino_root, ino);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
retry:
|
||||
if (!e)
|
||||
new = f2fs_kmem_cache_alloc(ino_entry_slab,
|
||||
GFP_NOFS, true, NULL);
|
||||
|
||||
radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
|
||||
|
||||
spin_lock(&im->ino_lock);
|
||||
e = radix_tree_lookup(&im->ino_root, ino);
|
||||
if (!e) {
|
||||
e = tmp;
|
||||
if (!new) {
|
||||
spin_unlock(&im->ino_lock);
|
||||
goto retry;
|
||||
}
|
||||
e = new;
|
||||
if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
|
||||
f2fs_bug_on(sbi, 1);
|
||||
|
||||
@ -491,8 +510,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
|
||||
spin_unlock(&im->ino_lock);
|
||||
radix_tree_preload_end();
|
||||
|
||||
if (e != tmp)
|
||||
kmem_cache_free(ino_entry_slab, tmp);
|
||||
if (new && e != new)
|
||||
kmem_cache_free(ino_entry_slab, new);
|
||||
}
|
||||
|
||||
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
|
||||
@ -525,7 +544,7 @@ void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
|
||||
__remove_ino_entry(sbi, ino, type);
|
||||
}
|
||||
|
||||
/* mode should be APPEND_INO or UPDATE_INO */
|
||||
/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */
|
||||
bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
|
||||
{
|
||||
struct inode_management *im = &sbi->im[mode];
|
||||
@ -638,7 +657,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
|
||||
return PTR_ERR(inode);
|
||||
}
|
||||
|
||||
err = dquot_initialize(inode);
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err) {
|
||||
iput(inode);
|
||||
goto err_out;
|
||||
@ -649,7 +668,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
|
||||
/* truncate all the data during iput */
|
||||
iput(inode);
|
||||
|
||||
err = f2fs_get_node_info(sbi, ino, &ni);
|
||||
err = f2fs_get_node_info(sbi, ino, &ni, false);
|
||||
if (err)
|
||||
goto err_out;
|
||||
|
||||
@ -690,9 +709,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_QUOTA
|
||||
/* Needed for iput() to work correctly and not trash data */
|
||||
sbi->sb->s_flags |= MS_ACTIVE;
|
||||
|
||||
/*
|
||||
* Turn on quotas which were not enabled for read-only mounts if
|
||||
* filesystem has quota feature, so that they are updated correctly.
|
||||
@ -718,6 +734,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
|
||||
orphan_blk = (struct f2fs_orphan_block *)page_address(page);
|
||||
for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
|
||||
nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
|
||||
|
||||
err = recover_orphan_inode(sbi, ino);
|
||||
if (err) {
|
||||
f2fs_put_page(page, 1);
|
||||
@ -851,6 +868,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
|
||||
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
|
||||
struct f2fs_checkpoint *cp_block = NULL;
|
||||
unsigned long long cur_version = 0, pre_version = 0;
|
||||
unsigned int cp_blocks;
|
||||
int err;
|
||||
|
||||
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
|
||||
@ -858,15 +876,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
|
||||
if (err)
|
||||
return NULL;
|
||||
|
||||
if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
|
||||
sbi->blocks_per_seg) {
|
||||
cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
|
||||
|
||||
if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
|
||||
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
|
||||
le32_to_cpu(cp_block->cp_pack_total_block_count));
|
||||
goto invalid_cp;
|
||||
}
|
||||
pre_version = *version;
|
||||
|
||||
cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
|
||||
cp_addr += cp_blocks - 1;
|
||||
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
|
||||
&cp_page_2, version);
|
||||
if (err)
|
||||
@ -1016,8 +1035,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
|
||||
inode_inc_dirty_pages(inode);
|
||||
spin_unlock(&sbi->inode_lock[type]);
|
||||
|
||||
f2fs_set_page_private(page, 0);
|
||||
f2fs_trace_pid(page);
|
||||
set_page_private_reference(page);
|
||||
}
|
||||
|
||||
void f2fs_remove_dirty_inode(struct inode *inode)
|
||||
@ -1147,7 +1165,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
|
||||
if (!is_journalled_quota(sbi))
|
||||
return false;
|
||||
|
||||
down_write(&sbi->quota_sem);
|
||||
if (!f2fs_down_write_trylock(&sbi->quota_sem))
|
||||
return true;
|
||||
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
|
||||
ret = false;
|
||||
} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
|
||||
@ -1158,7 +1177,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
|
||||
} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
|
||||
ret = true;
|
||||
}
|
||||
up_write(&sbi->quota_sem);
|
||||
f2fs_up_write(&sbi->quota_sem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1215,10 +1234,10 @@ retry_flush_dents:
|
||||
* POR: we should ensure that there are no dirty node pages
|
||||
* until finishing nat/sit flush. inode->i_blocks can be updated.
|
||||
*/
|
||||
down_write(&sbi->node_change);
|
||||
f2fs_down_write(&sbi->node_change);
|
||||
|
||||
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
|
||||
up_write(&sbi->node_change);
|
||||
f2fs_up_write(&sbi->node_change);
|
||||
f2fs_unlock_all(sbi);
|
||||
err = f2fs_sync_inode_meta(sbi);
|
||||
if (err)
|
||||
@ -1228,15 +1247,15 @@ retry_flush_dents:
|
||||
}
|
||||
|
||||
retry_flush_nodes:
|
||||
down_write(&sbi->node_write);
|
||||
f2fs_down_write(&sbi->node_write);
|
||||
|
||||
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
|
||||
up_write(&sbi->node_write);
|
||||
f2fs_up_write(&sbi->node_write);
|
||||
atomic_inc(&sbi->wb_sync_req[NODE]);
|
||||
err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
|
||||
atomic_dec(&sbi->wb_sync_req[NODE]);
|
||||
if (err) {
|
||||
up_write(&sbi->node_change);
|
||||
f2fs_up_write(&sbi->node_change);
|
||||
f2fs_unlock_all(sbi);
|
||||
return err;
|
||||
}
|
||||
@ -1249,13 +1268,13 @@ retry_flush_nodes:
|
||||
* dirty node blocks and some checkpoint values by block allocation.
|
||||
*/
|
||||
__prepare_cp_block(sbi);
|
||||
up_write(&sbi->node_change);
|
||||
f2fs_up_write(&sbi->node_change);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void unblock_operations(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
up_write(&sbi->node_write);
|
||||
f2fs_up_write(&sbi->node_write);
|
||||
f2fs_unlock_all(sbi);
|
||||
}
|
||||
|
||||
@ -1264,8 +1283,6 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
|
||||
DEFINE_WAIT(wait);
|
||||
|
||||
for (;;) {
|
||||
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
|
||||
|
||||
if (!get_pages(sbi, type))
|
||||
break;
|
||||
|
||||
@ -1275,6 +1292,10 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
|
||||
if (type == F2FS_DIRTY_META)
|
||||
f2fs_sync_meta_pages(sbi, META, LONG_MAX,
|
||||
FS_CP_META_IO);
|
||||
else if (type == F2FS_WB_CP_DATA)
|
||||
f2fs_submit_merged_write(sbi, DATA);
|
||||
|
||||
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
|
||||
io_schedule_timeout(DEFAULT_IO_TIMEOUT);
|
||||
}
|
||||
finish_wait(&sbi->cp_wait, &wait);
|
||||
@ -1286,12 +1307,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&sbi->cp_lock, flags);
|
||||
if (cpc->reason & CP_UMOUNT) {
|
||||
if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
|
||||
NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) {
|
||||
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
|
||||
f2fs_notice(sbi, "Disable nat_bits due to no space");
|
||||
} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
|
||||
f2fs_nat_bitmap_enabled(sbi)) {
|
||||
f2fs_enable_nat_bits(sbi);
|
||||
set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
|
||||
f2fs_notice(sbi, "Rebuild and enable nat_bits");
|
||||
}
|
||||
}
|
||||
|
||||
if ((cpc->reason & CP_UMOUNT) &&
|
||||
le32_to_cpu(ckpt->cp_pack_total_block_count) >
|
||||
sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
|
||||
disable_nat_bits(sbi, false);
|
||||
spin_lock_irqsave(&sbi->cp_lock, flags);
|
||||
|
||||
if (cpc->reason & CP_TRIMMED)
|
||||
__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
|
||||
@ -1383,6 +1412,26 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi,
|
||||
f2fs_submit_merged_write(sbi, META_FLUSH);
|
||||
}
|
||||
|
||||
static inline u64 get_sectors_written(struct block_device *bdev)
|
||||
{
|
||||
return (u64)part_stat_read(bdev->bd_part, sectors[1]);
|
||||
}
|
||||
|
||||
u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
if (f2fs_is_multi_device(sbi)) {
|
||||
u64 sectors = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < sbi->s_ndevs; i++)
|
||||
sectors += get_sectors_written(FDEV(i).bdev);
|
||||
|
||||
return sectors;
|
||||
}
|
||||
|
||||
return get_sectors_written(sbi->sb->s_bdev);
|
||||
}
|
||||
|
||||
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
{
|
||||
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
|
||||
@ -1393,7 +1442,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
__u32 crc32 = 0;
|
||||
int i;
|
||||
int cp_payload_blks = __cp_payload(sbi);
|
||||
struct super_block *sb = sbi->sb;
|
||||
struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
|
||||
u64 kbytes_written;
|
||||
int err;
|
||||
@ -1421,7 +1469,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
|
||||
}
|
||||
|
||||
/* 2 cp + n data seg summary + orphan inode blocks */
|
||||
/* 2 cp + n data seg summary + orphan inode blocks */
|
||||
data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false);
|
||||
spin_lock_irqsave(&sbi->cp_lock, flags);
|
||||
if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
|
||||
@ -1435,7 +1483,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
orphan_blocks);
|
||||
|
||||
if (__remain_node_summaries(cpc->reason))
|
||||
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
|
||||
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
|
||||
cp_payload_blks + data_sum_blocks +
|
||||
orphan_blocks + NR_CURSEG_NODE_TYPE);
|
||||
else
|
||||
@ -1458,7 +1506,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
start_blk = __start_cp_next_addr(sbi);
|
||||
|
||||
/* write nat bits */
|
||||
if (enabled_nat_bits(sbi, cpc)) {
|
||||
if ((cpc->reason & CP_UMOUNT) &&
|
||||
is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
|
||||
__u64 cp_ver = cur_cp_version(ckpt);
|
||||
block_t blk;
|
||||
|
||||
@ -1488,9 +1537,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
|
||||
/* Record write statistics in the hot node summary */
|
||||
kbytes_written = sbi->kbytes_written;
|
||||
if (sb->s_bdev->bd_part)
|
||||
kbytes_written += BD_PART_WRITTEN(sbi);
|
||||
|
||||
kbytes_written += (f2fs_get_sectors_written(sbi) -
|
||||
sbi->sectors_written_start) >> 1;
|
||||
seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
|
||||
|
||||
if (__remain_node_summaries(cpc->reason)) {
|
||||
@ -1501,6 +1549,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
/* update user_block_counts */
|
||||
sbi->last_valid_block_count = sbi->total_valid_block_count;
|
||||
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
|
||||
percpu_counter_set(&sbi->rf_node_block_count, 0);
|
||||
|
||||
/* Here, we have one bio having CP pack except cp pack 2 page */
|
||||
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
|
||||
@ -1521,9 +1570,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
|
||||
/*
|
||||
* invalidate intermediate page cache borrowed from meta inode which are
|
||||
* used for migration of encrypted or verity inode's blocks.
|
||||
* used for migration of encrypted, verity or compressed inode's blocks.
|
||||
*/
|
||||
if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi))
|
||||
if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
|
||||
f2fs_sb_has_compression(sbi))
|
||||
invalidate_mapping_pages(META_MAPPING(sbi),
|
||||
MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
|
||||
|
||||
@ -1569,7 +1619,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
f2fs_warn(sbi, "Start checkpoint disabled!");
|
||||
}
|
||||
if (cpc->reason != CP_RESIZE)
|
||||
mutex_lock(&sbi->cp_mutex);
|
||||
f2fs_down_write(&sbi->cp_global_sem);
|
||||
|
||||
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
|
||||
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
|
||||
@ -1597,7 +1647,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (NM_I(sbi)->dirty_nat_cnt == 0 &&
|
||||
if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
|
||||
SIT_I(sbi)->dirty_sentries == 0 &&
|
||||
prefree_segments(sbi) == 0) {
|
||||
f2fs_flush_sit_entries(sbi, cpc);
|
||||
@ -1617,16 +1667,27 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
|
||||
/* write cached NAT/SIT entries to NAT/SIT area */
|
||||
err = f2fs_flush_nat_entries(sbi, cpc);
|
||||
if (err)
|
||||
if (err) {
|
||||
f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err);
|
||||
f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
|
||||
goto stop;
|
||||
}
|
||||
|
||||
f2fs_flush_sit_entries(sbi, cpc);
|
||||
|
||||
/* save inmem log status */
|
||||
f2fs_save_inmem_curseg(sbi);
|
||||
|
||||
err = do_checkpoint(sbi, cpc);
|
||||
if (err)
|
||||
if (err) {
|
||||
f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err);
|
||||
f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
|
||||
f2fs_release_discard_addrs(sbi);
|
||||
else
|
||||
} else {
|
||||
f2fs_clear_prefree_segments(sbi, cpc);
|
||||
}
|
||||
|
||||
f2fs_restore_inmem_curseg(sbi);
|
||||
stop:
|
||||
unblock_operations(sbi);
|
||||
stat_inc_cp_count(sbi->stat_info);
|
||||
@ -1639,7 +1700,7 @@ stop:
|
||||
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
|
||||
out:
|
||||
if (cpc->reason != CP_RESIZE)
|
||||
mutex_unlock(&sbi->cp_mutex);
|
||||
f2fs_up_write(&sbi->cp_global_sem);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -1657,7 +1718,7 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
|
||||
}
|
||||
|
||||
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
|
||||
NR_CURSEG_TYPE - __cp_payload(sbi)) *
|
||||
NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
|
||||
F2FS_ORPHANS_PER_BLOCK;
|
||||
}
|
||||
|
||||
@ -1681,3 +1742,178 @@ void f2fs_destroy_checkpoint_caches(void)
|
||||
kmem_cache_destroy(ino_entry_slab);
|
||||
kmem_cache_destroy(f2fs_inode_entry_slab);
|
||||
}
|
||||
|
||||
static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct cp_control cpc = { .reason = CP_SYNC, };
|
||||
int err;
|
||||
|
||||
f2fs_down_write(&sbi->gc_lock);
|
||||
err = f2fs_write_checkpoint(sbi, &cpc);
|
||||
f2fs_up_write(&sbi->gc_lock);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
struct ckpt_req *req, *next;
|
||||
struct llist_node *dispatch_list;
|
||||
u64 sum_diff = 0, diff, count = 0;
|
||||
int ret;
|
||||
|
||||
dispatch_list = llist_del_all(&cprc->issue_list);
|
||||
if (!dispatch_list)
|
||||
return;
|
||||
dispatch_list = llist_reverse_order(dispatch_list);
|
||||
|
||||
ret = __write_checkpoint_sync(sbi);
|
||||
atomic_inc(&cprc->issued_ckpt);
|
||||
|
||||
llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
|
||||
diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
|
||||
req->ret = ret;
|
||||
complete(&req->wait);
|
||||
|
||||
sum_diff += diff;
|
||||
count++;
|
||||
}
|
||||
atomic_sub(count, &cprc->queued_ckpt);
|
||||
atomic_add(count, &cprc->total_ckpt);
|
||||
|
||||
spin_lock(&cprc->stat_lock);
|
||||
cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
|
||||
if (cprc->peak_time < cprc->cur_time)
|
||||
cprc->peak_time = cprc->cur_time;
|
||||
spin_unlock(&cprc->stat_lock);
|
||||
}
|
||||
|
||||
static int issue_checkpoint_thread(void *data)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = data;
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
wait_queue_head_t *q = &cprc->ckpt_wait_queue;
|
||||
repeat:
|
||||
if (kthread_should_stop())
|
||||
return 0;
|
||||
|
||||
if (!llist_empty(&cprc->issue_list))
|
||||
__checkpoint_and_complete_reqs(sbi);
|
||||
|
||||
wait_event_interruptible(*q,
|
||||
kthread_should_stop() || !llist_empty(&cprc->issue_list));
|
||||
goto repeat;
|
||||
}
|
||||
|
||||
static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
|
||||
struct ckpt_req *wait_req)
|
||||
{
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
|
||||
if (!llist_empty(&cprc->issue_list)) {
|
||||
__checkpoint_and_complete_reqs(sbi);
|
||||
} else {
|
||||
/* already dispatched by issue_checkpoint_thread */
|
||||
if (wait_req)
|
||||
wait_for_completion(&wait_req->wait);
|
||||
}
|
||||
}
|
||||
|
||||
static void init_ckpt_req(struct ckpt_req *req)
|
||||
{
|
||||
memset(req, 0, sizeof(struct ckpt_req));
|
||||
|
||||
init_completion(&req->wait);
|
||||
req->queue_time = ktime_get();
|
||||
}
|
||||
|
||||
int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
struct ckpt_req req;
|
||||
struct cp_control cpc;
|
||||
|
||||
cpc.reason = __get_cp_reason(sbi);
|
||||
if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
|
||||
int ret;
|
||||
|
||||
f2fs_down_write(&sbi->gc_lock);
|
||||
ret = f2fs_write_checkpoint(sbi, &cpc);
|
||||
f2fs_up_write(&sbi->gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!cprc->f2fs_issue_ckpt)
|
||||
return __write_checkpoint_sync(sbi);
|
||||
|
||||
init_ckpt_req(&req);
|
||||
|
||||
llist_add(&req.llnode, &cprc->issue_list);
|
||||
atomic_inc(&cprc->queued_ckpt);
|
||||
|
||||
/*
|
||||
* update issue_list before we wake up issue_checkpoint thread,
|
||||
* this smp_mb() pairs with another barrier in ___wait_event(),
|
||||
* see more details in comments of waitqueue_active().
|
||||
*/
|
||||
smp_mb();
|
||||
|
||||
if (waitqueue_active(&cprc->ckpt_wait_queue))
|
||||
wake_up(&cprc->ckpt_wait_queue);
|
||||
|
||||
if (cprc->f2fs_issue_ckpt)
|
||||
wait_for_completion(&req.wait);
|
||||
else
|
||||
flush_remained_ckpt_reqs(sbi, &req);
|
||||
|
||||
return req.ret;
|
||||
}
|
||||
|
||||
int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
dev_t dev = sbi->sb->s_bdev->bd_dev;
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
|
||||
if (cprc->f2fs_issue_ckpt)
|
||||
return 0;
|
||||
|
||||
cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
|
||||
"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
|
||||
if (IS_ERR(cprc->f2fs_issue_ckpt)) {
|
||||
cprc->f2fs_issue_ckpt = NULL;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
|
||||
if (cprc->f2fs_issue_ckpt) {
|
||||
struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt;
|
||||
|
||||
cprc->f2fs_issue_ckpt = NULL;
|
||||
kthread_stop(ckpt_task);
|
||||
|
||||
flush_remained_ckpt_reqs(sbi, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
|
||||
atomic_set(&cprc->issued_ckpt, 0);
|
||||
atomic_set(&cprc->total_ckpt, 0);
|
||||
atomic_set(&cprc->queued_ckpt, 0);
|
||||
cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO;
|
||||
init_waitqueue_head(&cprc->ckpt_wait_queue);
|
||||
init_llist_head(&cprc->issue_list);
|
||||
spin_lock_init(&cprc->stat_lock);
|
||||
}
|
||||
|
fs/f2fs/data.c (1360 lines changed): diff suppressed because it is too large.

fs/f2fs/debug.c (183 lines changed):
@ -21,7 +21,7 @@
|
||||
#include "gc.h"
|
||||
|
||||
static LIST_HEAD(f2fs_stat_list);
|
||||
static DEFINE_MUTEX(f2fs_stat_mutex);
|
||||
static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
static struct dentry *f2fs_debugfs_root;
|
||||
#endif
|
||||
@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi)
|
||||
atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
|
||||
si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
|
||||
}
|
||||
si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt);
|
||||
si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt);
|
||||
si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt);
|
||||
spin_lock(&sbi->cprc_info.stat_lock);
|
||||
si->cur_ckpt_time = sbi->cprc_info.cur_time;
|
||||
si->peak_ckpt_time = sbi->cprc_info.peak_time;
|
||||
spin_unlock(&sbi->cprc_info.stat_lock);
|
||||
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
|
||||
si->rsvd_segs = reserved_segments(sbi);
|
||||
si->overp_segs = overprovision_segments(sbi);
|
||||
@ -131,7 +138,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
|
||||
si->inline_inode = atomic_read(&sbi->inline_inode);
|
||||
si->inline_dir = atomic_read(&sbi->inline_dir);
|
||||
si->compr_inode = atomic_read(&sbi->compr_inode);
|
||||
si->compr_blocks = atomic_read(&sbi->compr_blocks);
|
||||
si->compr_blocks = atomic64_read(&sbi->compr_blocks);
|
||||
si->append = sbi->im[APPEND_INO].ino_num;
|
||||
si->update = sbi->im[UPDATE_INO].ino_num;
|
||||
si->orphans = sbi->im[ORPHAN_INO].ino_num;
|
||||
@ -145,8 +152,14 @@ static void update_general_status(struct f2fs_sb_info *sbi)
|
||||
si->node_pages = NODE_MAPPING(sbi)->nrpages;
|
||||
if (sbi->meta_inode)
|
||||
si->meta_pages = META_MAPPING(sbi)->nrpages;
|
||||
si->nats = NM_I(sbi)->nat_cnt;
|
||||
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
if (sbi->compress_inode) {
|
||||
si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages;
|
||||
si->compress_page_hit = atomic_read(&sbi->compress_page_hit);
|
||||
}
|
||||
#endif
|
||||
si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT];
|
||||
si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT];
|
||||
si->sits = MAIN_SEGS(sbi);
|
||||
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
|
||||
si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
|
||||
@ -164,8 +177,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
|
||||
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
|
||||
/ 2;
|
||||
si->util_invalid = 50 - si->util_free - si->util_valid;
|
||||
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
|
||||
for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
|
||||
struct curseg_info *curseg = CURSEG_I(sbi, i);
|
||||
|
||||
si->curseg[i] = curseg->segno;
|
||||
si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
|
||||
si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
|
||||
@ -174,6 +188,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
|
||||
for (i = META_CP; i < META_MAX; i++)
|
||||
si->meta_count[i] = atomic_read(&sbi->meta_count[i]);
|
||||
|
||||
for (i = 0; i < NO_CHECK_TYPE; i++) {
|
||||
si->dirty_seg[i] = 0;
|
||||
si->full_seg[i] = 0;
|
||||
si->valid_blks[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < MAIN_SEGS(sbi); i++) {
|
||||
int blks = get_seg_entry(sbi, i)->valid_blocks;
|
||||
int type = get_seg_entry(sbi, i)->type;
|
||||
|
||||
if (!blks)
|
||||
continue;
|
||||
|
||||
if (blks == sbi->blocks_per_seg)
|
||||
si->full_seg[type]++;
|
||||
else
|
||||
si->dirty_seg[type]++;
|
||||
si->valid_blks[type] += blks;
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; i++) {
|
||||
si->segment_count[i] = sbi->segment_count[i];
|
||||
si->block_count[i] = sbi->block_count[i];
|
||||
@ -258,9 +292,10 @@ get_cache:
|
||||
si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] +
|
||||
NM_I(sbi)->nid_cnt[PREALLOC_NID]) *
|
||||
sizeof(struct free_nid);
|
||||
si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
|
||||
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
|
||||
sizeof(struct nat_entry_set);
|
||||
si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] *
|
||||
sizeof(struct nat_entry);
|
||||
si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
|
||||
sizeof(struct nat_entry_set);
|
||||
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
|
||||
for (i = 0; i < MAX_INO_ENTRY; i++)
|
||||
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
|
||||
@ -272,21 +307,47 @@ get_cache:
|
||||
si->page_mem = 0;
|
||||
if (sbi->node_inode) {
|
||||
unsigned npages = NODE_MAPPING(sbi)->nrpages;
|
||||
|
||||
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
|
||||
}
|
||||
if (sbi->meta_inode) {
|
||||
unsigned npages = META_MAPPING(sbi)->nrpages;
|
||||
|
||||
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
|
||||
}
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
if (sbi->compress_inode) {
|
||||
unsigned npages = COMPRESS_MAPPING(sbi)->nrpages;
|
||||
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static char *s_flag[] = {
|
||||
[SBI_IS_DIRTY] = " fs_dirty",
|
||||
[SBI_IS_CLOSE] = " closing",
|
||||
[SBI_NEED_FSCK] = " need_fsck",
|
||||
[SBI_POR_DOING] = " recovering",
|
||||
[SBI_NEED_SB_WRITE] = " sb_dirty",
|
||||
[SBI_NEED_CP] = " need_cp",
|
||||
[SBI_IS_SHUTDOWN] = " shutdown",
|
||||
[SBI_IS_RECOVERED] = " recovered",
|
||||
[SBI_CP_DISABLED] = " cp_disabled",
|
||||
[SBI_CP_DISABLED_QUICK] = " cp_disabled_quick",
|
||||
[SBI_QUOTA_NEED_FLUSH] = " quota_need_flush",
|
||||
[SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
|
||||
[SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
|
||||
[SBI_IS_RESIZEFS] = " resizefs",
|
||||
[SBI_IS_FREEZING] = " freezefs",
|
||||
};
|
||||
|
||||
static int stat_show(struct seq_file *s, void *v)
|
||||
{
|
||||
struct f2fs_stat_info *si;
|
||||
int i = 0;
|
||||
int j;
|
||||
int i = 0, j = 0;
|
||||
unsigned long flags;
|
||||
|
||||
mutex_lock(&f2fs_stat_mutex);
|
||||
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
|
||||
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
|
||||
update_general_status(si->sbi);
|
||||
|
||||
@ -294,7 +355,13 @@ static int stat_show(struct seq_file *s, void *v)
|
||||
si->sbi->sb->s_bdev, i++,
|
||||
f2fs_readonly(si->sbi->sb) ? "RO": "RW",
|
||||
is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
|
||||
"Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"));
|
||||
"Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
|
||||
if (si->sbi->s_flag) {
|
||||
seq_puts(s, "[SBI:");
|
||||
for_each_set_bit(j, &si->sbi->s_flag, 32)
|
||||
seq_puts(s, s_flag[j]);
|
||||
seq_puts(s, "]\n");
|
||||
}
|
||||
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
|
||||
si->sit_area_segs, si->nat_area_segs);
|
||||
seq_printf(s, "[SSA: %d] [MAIN: %d",
|
||||
@ -322,37 +389,65 @@ static int stat_show(struct seq_file *s, void *v)
|
||||
si->inline_inode);
|
||||
seq_printf(s, " - Inline_dentry Inode: %u\n",
|
||||
si->inline_dir);
|
||||
seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n",
|
||||
seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n",
|
||||
si->compr_inode, si->compr_blocks);
|
||||
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
|
||||
si->orphans, si->append, si->update);
|
||||
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
|
||||
si->main_area_segs, si->main_area_sections,
|
||||
si->main_area_zones);
|
||||
seq_printf(s, " - COLD data: %d, %d, %d\n",
|
||||
seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n",
|
||||
"segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk");
|
||||
seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n",
|
||||
si->curseg[CURSEG_COLD_DATA],
|
||||
si->cursec[CURSEG_COLD_DATA],
|
||||
si->curzone[CURSEG_COLD_DATA]);
|
||||
seq_printf(s, " - WARM data: %d, %d, %d\n",
|
||||
si->curzone[CURSEG_COLD_DATA],
|
||||
si->dirty_seg[CURSEG_COLD_DATA],
|
||||
si->full_seg[CURSEG_COLD_DATA],
|
||||
si->valid_blks[CURSEG_COLD_DATA]);
|
||||
seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n",
|
||||
si->curseg[CURSEG_WARM_DATA],
|
||||
si->cursec[CURSEG_WARM_DATA],
|
||||
si->curzone[CURSEG_WARM_DATA]);
|
||||
seq_printf(s, " - HOT data: %d, %d, %d\n",
|
||||
si->curzone[CURSEG_WARM_DATA],
|
||||
si->dirty_seg[CURSEG_WARM_DATA],
|
||||
si->full_seg[CURSEG_WARM_DATA],
|
||||
si->valid_blks[CURSEG_WARM_DATA]);
|
||||
seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n",
|
||||
si->curseg[CURSEG_HOT_DATA],
|
||||
si->cursec[CURSEG_HOT_DATA],
|
||||
si->curzone[CURSEG_HOT_DATA]);
|
||||
seq_printf(s, " - Dir dnode: %d, %d, %d\n",
|
||||
si->curzone[CURSEG_HOT_DATA],
|
||||
si->dirty_seg[CURSEG_HOT_DATA],
|
||||
si->full_seg[CURSEG_HOT_DATA],
|
||||
si->valid_blks[CURSEG_HOT_DATA]);
|
||||
seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n",
|
||||
si->curseg[CURSEG_HOT_NODE],
|
||||
si->cursec[CURSEG_HOT_NODE],
|
||||
si->curzone[CURSEG_HOT_NODE]);
|
||||
seq_printf(s, " - File dnode: %d, %d, %d\n",
|
||||
si->curzone[CURSEG_HOT_NODE],
|
||||
si->dirty_seg[CURSEG_HOT_NODE],
|
||||
si->full_seg[CURSEG_HOT_NODE],
|
||||
si->valid_blks[CURSEG_HOT_NODE]);
|
||||
seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n",
|
||||
si->curseg[CURSEG_WARM_NODE],
|
||||
si->cursec[CURSEG_WARM_NODE],
|
||||
si->curzone[CURSEG_WARM_NODE]);
|
||||
seq_printf(s, " - Indir nodes: %d, %d, %d\n",
|
||||
si->curzone[CURSEG_WARM_NODE],
|
||||
si->dirty_seg[CURSEG_WARM_NODE],
|
||||
si->full_seg[CURSEG_WARM_NODE],
|
||||
si->valid_blks[CURSEG_WARM_NODE]);
|
||||
seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n",
|
||||
si->curseg[CURSEG_COLD_NODE],
|
||||
si->cursec[CURSEG_COLD_NODE],
|
||||
si->curzone[CURSEG_COLD_NODE]);
|
||||
si->curzone[CURSEG_COLD_NODE],
|
||||
si->dirty_seg[CURSEG_COLD_NODE],
|
||||
si->full_seg[CURSEG_COLD_NODE],
|
||||
si->valid_blks[CURSEG_COLD_NODE]);
|
||||
seq_printf(s, " - Pinned file: %8d %8d %8d\n",
|
||||
si->curseg[CURSEG_COLD_DATA_PINNED],
|
||||
si->cursec[CURSEG_COLD_DATA_PINNED],
|
||||
si->curzone[CURSEG_COLD_DATA_PINNED]);
|
||||
seq_printf(s, " - ATGC data: %8d %8d %8d\n",
|
||||
si->curseg[CURSEG_ALL_DATA_ATGC],
|
||||
si->cursec[CURSEG_ALL_DATA_ATGC],
|
||||
si->curzone[CURSEG_ALL_DATA_ATGC]);
|
||||
seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
|
||||
si->main_area_segs - si->dirty_count -
|
||||
si->prefree_count - si->free_segs,
|
||||
@ -368,12 +463,28 @@ static int stat_show(struct seq_file *s, void *v)
|
||||
si->meta_count[META_NAT]);
|
||||
seq_printf(s, " - ssa blocks : %u\n",
|
||||
si->meta_count[META_SSA]);
|
||||
seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
|
||||
"Cur time: %4d(ms), Peak time: %4d(ms))\n",
|
||||
si->nr_queued_ckpt, si->nr_issued_ckpt,
|
||||
si->nr_total_ckpt, si->cur_ckpt_time,
|
||||
si->peak_ckpt_time);
|
||||
seq_printf(s, "GC calls: %d (BG: %d)\n",
|
||||
si->call_count, si->bg_gc);
|
||||
seq_printf(s, " - data segments : %d (%d)\n",
|
||||
si->data_segs, si->bg_data_segs);
|
||||
seq_printf(s, " - node segments : %d (%d)\n",
|
||||
si->node_segs, si->bg_node_segs);
|
||||
seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
|
||||
"Idle Greedy (%d), Idle AT (%d), "
|
||||
"Urgent High (%d), Urgent Mid (%d), "
|
||||
"Urgent Low (%d)\n",
|
||||
si->sbi->gc_reclaimed_segs[GC_NORMAL],
|
||||
si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
|
||||
si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
|
||||
si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
|
||||
si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
|
||||
si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
|
||||
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
|
||||
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
|
||||
si->bg_data_blks + si->bg_node_blks);
|
||||
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
|
||||
@ -412,6 +523,7 @@ static int stat_show(struct seq_file *s, void *v)
|
||||
"volatile IO: %4d (Max. %4d)\n",
|
||||
si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
|
||||
si->vw_cnt, si->max_vw_cnt);
|
||||
seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
|
||||
seq_printf(s, " - nodes: %4d in %4d\n",
|
||||
si->ndirty_node, si->node_pages);
|
||||
seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
|
||||
@ -424,6 +536,9 @@ static int stat_show(struct seq_file *s, void *v)
|
||||
si->ndirty_meta, si->meta_pages);
|
||||
seq_printf(s, " - imeta: %4d\n",
|
||||
si->ndirty_imeta);
|
||||
seq_printf(s, " - fsync mark: %4lld\n",
|
||||
percpu_counter_sum_positive(
|
||||
&si->sbi->rf_node_block_count));
|
||||
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
|
||||
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
|
||||
seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
|
||||
@ -465,7 +580,7 @@ static int stat_show(struct seq_file *s, void *v)
|
||||
seq_printf(s, " - paged : %llu KB\n",
|
||||
si->page_mem >> 10);
|
||||
}
|
||||
mutex_unlock(&f2fs_stat_mutex);
|
||||
raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -487,6 +602,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
|
||||
struct f2fs_stat_info *si;
|
||||
unsigned long flags;
|
||||
int i;
|
||||
|
||||
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
|
||||
@ -513,7 +629,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
|
||||
atomic_set(&sbi->inline_inode, 0);
|
||||
atomic_set(&sbi->inline_dir, 0);
|
||||
atomic_set(&sbi->compr_inode, 0);
|
||||
atomic_set(&sbi->compr_blocks, 0);
|
||||
atomic64_set(&sbi->compr_blocks, 0);
|
||||
atomic_set(&sbi->inplace_count, 0);
|
||||
for (i = META_CP; i < META_MAX; i++)
|
||||
atomic_set(&sbi->meta_count[i], 0);
|
||||
@ -522,9 +638,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
|
||||
atomic_set(&sbi->max_aw_cnt, 0);
|
||||
atomic_set(&sbi->max_vw_cnt, 0);
|
||||
|
||||
mutex_lock(&f2fs_stat_mutex);
|
||||
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
|
||||
list_add_tail(&si->stat_list, &f2fs_stat_list);
|
||||
mutex_unlock(&f2fs_stat_mutex);
|
||||
raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -532,12 +648,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
|
||||
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct f2fs_stat_info *si = F2FS_STAT(sbi);
|
||||
unsigned long flags;
|
||||
|
||||
mutex_lock(&f2fs_stat_mutex);
|
||||
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
|
||||
list_del(&si->stat_list);
|
||||
mutex_unlock(&f2fs_stat_mutex);
|
||||
raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
|
||||
|
||||
kvfree(si);
|
||||
kfree(si);
|
||||
}
|
||||
|
||||
void __init f2fs_create_root_stats(void)
|
||||
@ -545,7 +662,7 @@ void __init f2fs_create_root_stats(void)
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
|
||||
|
||||
debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
|
||||
debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL,
|
||||
&stat_fops);
|
||||
#endif
|
||||
}
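f2fs_create_root_stats() above keeps the usual debugfs-plus-seq_file registration and only switches the node to a plain octal mode (0444). A minimal sketch of that same general pattern outside f2fs, with made-up "demo" names; it is illustrative only, not code from this series:

#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *demo_debugfs_root;

/* callback that renders the file contents on every read */
static int demo_show(struct seq_file *s, void *v)
{
	seq_puts(s, "demo: nothing to report\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, inode->i_private);
}

static const struct file_operations demo_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init demo_init(void)
{
	demo_debugfs_root = debugfs_create_dir("demo", NULL);
	/* 0444: world-readable, same mode the f2fs "status" node now uses */
	debugfs_create_file("status", 0444, demo_debugfs_root, NULL, &demo_fops);
	return 0;
}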
|
||||
|
fs/f2fs/dir.c (137 changes)
@ -16,6 +16,10 @@
|
||||
#include "xattr.h"
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
#ifdef CONFIG_UNICODE
|
||||
extern struct kmem_cache *f2fs_cf_name_slab;
|
||||
#endif
|
||||
|
||||
static unsigned long dir_blocks(struct inode *inode)
|
||||
{
|
||||
return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
|
||||
@ -76,21 +80,21 @@ int f2fs_init_casefolded_name(const struct inode *dir,
|
||||
struct f2fs_filename *fname)
|
||||
{
|
||||
#ifdef CONFIG_UNICODE
|
||||
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
|
||||
struct super_block *sb = dir->i_sb;
|
||||
|
||||
if (IS_CASEFOLDED(dir)) {
|
||||
fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN,
|
||||
GFP_NOFS);
|
||||
fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
|
||||
GFP_NOFS, false, F2FS_SB(sb));
|
||||
if (!fname->cf_name.name)
|
||||
return -ENOMEM;
|
||||
fname->cf_name.len = utf8_casefold(sbi->sb->s_encoding,
|
||||
fname->cf_name.len = utf8_casefold(sb->s_encoding,
|
||||
fname->usr_fname,
|
||||
fname->cf_name.name,
|
||||
F2FS_NAME_LEN);
|
||||
F2FS_NAME_LEN);
|
||||
if ((int)fname->cf_name.len <= 0) {
|
||||
kfree(fname->cf_name.name);
|
||||
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
|
||||
fname->cf_name.name = NULL;
|
||||
if (sb_has_enc_strict_mode(dir->i_sb))
|
||||
if (sb_has_strict_encoding(sb))
|
||||
return -EINVAL;
|
||||
/* fall back to treating name as opaque byte sequence */
|
||||
}
|
||||
@ -112,7 +116,7 @@ static int __f2fs_setup_filename(const struct inode *dir,
|
||||
#ifdef CONFIG_FS_ENCRYPTION
|
||||
fname->crypto_buf = crypt_name->crypto_buf;
|
||||
#endif
|
||||
if (crypt_name->is_ciphertext_name) {
|
||||
if (crypt_name->is_nokey_name) {
|
||||
/* hash was decoded from the no-key name */
|
||||
fname->hash = cpu_to_le32(crypt_name->hash);
|
||||
} else {
|
||||
@ -171,8 +175,10 @@ void f2fs_free_filename(struct f2fs_filename *fname)
|
||||
fname->crypto_buf.name = NULL;
|
||||
#endif
|
||||
#ifdef CONFIG_UNICODE
|
||||
kfree(fname->cf_name.name);
|
||||
fname->cf_name.name = NULL;
|
||||
if (fname->cf_name.name) {
|
||||
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
|
||||
fname->cf_name.name = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -191,29 +197,25 @@ static unsigned long dir_block_index(unsigned int level,
|
||||
static struct f2fs_dir_entry *find_in_block(struct inode *dir,
|
||||
struct page *dentry_page,
|
||||
const struct f2fs_filename *fname,
|
||||
int *max_slots,
|
||||
struct page **res_page)
|
||||
int *max_slots)
|
||||
{
|
||||
struct f2fs_dentry_block *dentry_blk;
|
||||
struct f2fs_dir_entry *de;
|
||||
struct f2fs_dentry_ptr d;
|
||||
|
||||
dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
|
||||
|
||||
make_dentry_ptr_block(dir, &d, dentry_blk);
|
||||
de = f2fs_find_target_dentry(&d, fname, max_slots);
|
||||
if (de)
|
||||
*res_page = dentry_page;
|
||||
|
||||
return de;
|
||||
return f2fs_find_target_dentry(&d, fname, max_slots);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_UNICODE
|
||||
/*
|
||||
* Test whether a case-insensitive directory entry matches the filename
|
||||
* being searched for.
|
||||
*
|
||||
* Returns 1 for a match, 0 for no match, and -errno on an error.
|
||||
*/
|
||||
static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
|
||||
static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
|
||||
const u8 *de_name, u32 de_name_len)
|
||||
{
|
||||
const struct super_block *sb = dir->i_sb;
|
||||
@ -227,11 +229,11 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
|
||||
FSTR_INIT((u8 *)de_name, de_name_len);
|
||||
|
||||
if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir)))
|
||||
return false;
|
||||
return -EINVAL;
|
||||
|
||||
decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
|
||||
if (!decrypted_name.name)
|
||||
return false;
|
||||
return -ENOMEM;
|
||||
res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name,
|
||||
&decrypted_name);
|
||||
if (res < 0)
|
||||
@ -241,23 +243,24 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
|
||||
}
|
||||
|
||||
res = utf8_strncasecmp_folded(um, name, &entry);
|
||||
if (res < 0) {
|
||||
/*
|
||||
* In strict mode, ignore invalid names. In non-strict mode,
|
||||
* fall back to treating them as opaque byte sequences.
|
||||
*/
|
||||
if (sb_has_enc_strict_mode(sb) || name->len != entry.len)
|
||||
res = 1;
|
||||
else
|
||||
res = memcmp(name->name, entry.name, name->len);
|
||||
/*
|
||||
* In strict mode, ignore invalid names. In non-strict mode,
|
||||
* fall back to treating them as opaque byte sequences.
|
||||
*/
|
||||
if (res < 0 && !sb_has_strict_encoding(sb)) {
|
||||
res = name->len == entry.len &&
|
||||
memcmp(name->name, entry.name, name->len) == 0;
|
||||
} else {
|
||||
/* utf8_strncasecmp_folded returns 0 on match */
|
||||
res = (res == 0);
|
||||
}
|
||||
out:
|
||||
kfree(decrypted_name.name);
|
||||
return res == 0;
|
||||
return res;
|
||||
}
|
||||
#endif /* CONFIG_UNICODE */
|
||||
|
||||
static inline bool f2fs_match_name(const struct inode *dir,
|
||||
static inline int f2fs_match_name(const struct inode *dir,
|
||||
const struct f2fs_filename *fname,
|
||||
const u8 *de_name, u32 de_name_len)
|
||||
{
|
||||
@ -284,6 +287,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
|
||||
struct f2fs_dir_entry *de;
|
||||
unsigned long bit_pos = 0;
|
||||
int max_len = 0;
|
||||
int res = 0;
|
||||
|
||||
if (max_slots)
|
||||
*max_slots = 0;
|
||||
@ -301,10 +305,15 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (de->hash_code == fname->hash &&
|
||||
f2fs_match_name(d->inode, fname, d->filename[bit_pos],
|
||||
le16_to_cpu(de->name_len)))
|
||||
goto found;
|
||||
if (de->hash_code == fname->hash) {
|
||||
res = f2fs_match_name(d->inode, fname,
|
||||
d->filename[bit_pos],
|
||||
le16_to_cpu(de->name_len));
|
||||
if (res < 0)
|
||||
return ERR_PTR(res);
|
||||
if (res)
|
||||
goto found;
|
||||
}
|
||||
|
||||
if (max_slots && max_len > *max_slots)
|
||||
*max_slots = max_len;
|
||||
@ -353,10 +362,15 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
|
||||
}
|
||||
}
|
||||
|
||||
de = find_in_block(dir, dentry_page, fname, &max_slots,
|
||||
res_page);
|
||||
if (de)
|
||||
de = find_in_block(dir, dentry_page, fname, &max_slots);
|
||||
if (IS_ERR(de)) {
|
||||
*res_page = ERR_CAST(de);
|
||||
de = NULL;
|
||||
break;
|
||||
} else if (de) {
|
||||
*res_page = dentry_page;
|
||||
break;
|
||||
}
|
||||
|
||||
if (max_slots >= s)
|
||||
room = true;
|
||||
@ -380,16 +394,15 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
|
||||
unsigned int max_depth;
|
||||
unsigned int level;
|
||||
|
||||
*res_page = NULL;
|
||||
|
||||
if (f2fs_has_inline_dentry(dir)) {
|
||||
*res_page = NULL;
|
||||
de = f2fs_find_in_inline_dir(dir, fname, res_page);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (npages == 0) {
|
||||
*res_page = NULL;
|
||||
if (npages == 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
max_depth = F2FS_I(dir)->i_current_depth;
|
||||
if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
|
||||
@ -400,7 +413,6 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
|
||||
}
|
||||
|
||||
for (level = 0; level < max_depth; level++) {
|
||||
*res_page = NULL;
|
||||
de = find_in_level(dir, level, fname, res_page);
|
||||
if (de || IS_ERR(*res_page))
|
||||
break;
|
||||
@ -466,6 +478,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
|
||||
struct page *page, struct inode *inode)
|
||||
{
|
||||
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
|
||||
|
||||
lock_page(page);
|
||||
f2fs_wait_on_page_writeback(page, type, true, true);
|
||||
de->ino = cpu_to_le32(inode->i_ino);
|
||||
@ -755,7 +768,7 @@ add_dentry:
|
||||
f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
|
||||
|
||||
if (inode) {
|
||||
down_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(inode)->i_sem);
|
||||
page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
|
||||
if (IS_ERR(page)) {
|
||||
err = PTR_ERR(page);
|
||||
@ -782,7 +795,7 @@ add_dentry:
|
||||
f2fs_update_parent_metadata(dir, inode, current_depth);
|
||||
fail:
|
||||
if (inode)
|
||||
up_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(inode)->i_sem);
|
||||
|
||||
f2fs_put_page(dentry_page, 1);
|
||||
|
||||
@ -820,7 +833,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
|
||||
return err;
|
||||
|
||||
/*
|
||||
* An immature stakable filesystem shows a race condition between lookup
|
||||
* An immature stackable filesystem shows a race condition between lookup
|
||||
* and create. If we have same task when doing lookup and create, it's
|
||||
* definitely fine as expected by VFS normally. Otherwise, let's just
|
||||
* verify on-disk dentry one more time, which guarantees filesystem
|
||||
@ -847,7 +860,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
|
||||
struct page *page;
|
||||
int err = 0;
|
||||
|
||||
down_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(inode)->i_sem);
|
||||
page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
|
||||
if (IS_ERR(page)) {
|
||||
err = PTR_ERR(page);
|
||||
@ -858,7 +871,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
|
||||
clear_inode_flag(inode, FI_NEW_INODE);
|
||||
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
|
||||
fail:
|
||||
up_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(inode)->i_sem);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -866,7 +879,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
|
||||
|
||||
down_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(inode)->i_sem);
|
||||
|
||||
if (S_ISDIR(inode->i_mode))
|
||||
f2fs_i_links_write(dir, false);
|
||||
@ -877,7 +890,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
|
||||
f2fs_i_links_write(inode, false);
|
||||
f2fs_i_size_write(inode, 0);
|
||||
}
|
||||
up_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(inode)->i_sem);
|
||||
|
||||
if (inode->i_nlink == 0)
|
||||
f2fs_add_orphan_inode(inode);
|
||||
@ -923,11 +936,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
|
||||
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
|
||||
f2fs_clear_radix_tree_dirty_tag(page);
|
||||
clear_page_dirty_for_io(page);
|
||||
f2fs_clear_page_private(page);
|
||||
ClearPageUptodate(page);
|
||||
clear_cold_data(page);
|
||||
|
||||
clear_page_private_gcing(page);
|
||||
|
||||
inode_dec_dirty_pages(dir);
|
||||
f2fs_remove_dirty_inode(dir);
|
||||
|
||||
detach_page_private(page);
|
||||
set_page_private(page, 0);
|
||||
}
|
||||
f2fs_put_page(page, 1);
|
||||
|
||||
@ -985,6 +1002,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
|
||||
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
|
||||
struct blk_plug plug;
|
||||
bool readdir_ra = sbi->readdir_ra == 1;
|
||||
bool found_valid_dirent = false;
|
||||
int err = 0;
|
||||
|
||||
bit_pos = ((unsigned long)ctx->pos % d->max);
|
||||
@ -999,13 +1017,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
|
||||
|
||||
de = &d->dentry[bit_pos];
|
||||
if (de->name_len == 0) {
|
||||
if (found_valid_dirent || !bit_pos) {
|
||||
printk_ratelimited(
|
||||
"%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
|
||||
KERN_WARNING, sbi->sb->s_id,
|
||||
le32_to_cpu(de->ino));
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
}
|
||||
bit_pos++;
|
||||
ctx->pos = start_pos + bit_pos;
|
||||
printk_ratelimited(
|
||||
"%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
|
||||
KERN_WARNING, sbi->sb->s_id,
|
||||
le32_to_cpu(de->ino));
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1048,6 +1068,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
|
||||
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
|
||||
|
||||
ctx->pos = start_pos + bit_pos;
|
||||
found_valid_dirent = true;
|
||||
}
|
||||
out:
|
||||
if (readdir_ra)
|
||||
|
@ -58,6 +58,29 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
|
||||
return re;
|
||||
}
|
||||
|
||||
struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi,
|
||||
struct rb_root_cached *root,
|
||||
struct rb_node **parent,
|
||||
unsigned long long key, bool *leftmost)
|
||||
{
|
||||
struct rb_node **p = &root->rb_root.rb_node;
|
||||
struct rb_entry *re;
|
||||
|
||||
while (*p) {
|
||||
*parent = *p;
|
||||
re = rb_entry(*parent, struct rb_entry, rb_node);
|
||||
|
||||
if (key < re->key) {
|
||||
p = &(*p)->rb_left;
|
||||
} else {
|
||||
p = &(*p)->rb_right;
|
||||
*leftmost = false;
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
|
||||
struct rb_root_cached *root,
|
||||
struct rb_node **parent,
|
||||
@ -166,7 +189,7 @@ lookup_neighbors:
|
||||
}
|
||||
|
||||
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
|
||||
struct rb_root_cached *root)
|
||||
struct rb_root_cached *root, bool check_key)
|
||||
{
|
||||
#ifdef CONFIG_F2FS_CHECK_FS
|
||||
struct rb_node *cur = rb_first_cached(root), *next;
|
||||
@ -183,13 +206,23 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
|
||||
cur_re = rb_entry(cur, struct rb_entry, rb_node);
|
||||
next_re = rb_entry(next, struct rb_entry, rb_node);
|
||||
|
||||
if (check_key) {
|
||||
if (cur_re->key > next_re->key) {
|
||||
f2fs_info(sbi, "inconsistent rbtree, "
|
||||
"cur(%llu) next(%llu)",
|
||||
cur_re->key, next_re->key);
|
||||
return false;
|
||||
}
|
||||
goto next;
|
||||
}
|
||||
|
||||
if (cur_re->ofs + cur_re->len > next_re->ofs) {
|
||||
f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)",
|
||||
cur_re->ofs, cur_re->len,
|
||||
next_re->ofs, next_re->len);
|
||||
return false;
|
||||
}
|
||||
|
||||
next:
|
||||
cur = next;
|
||||
}
|
||||
#endif
|
||||
@ -206,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
|
||||
{
|
||||
struct extent_node *en;
|
||||
|
||||
en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
|
||||
en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi);
|
||||
if (!en)
|
||||
return NULL;
|
||||
|
||||
@ -259,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
|
||||
mutex_lock(&sbi->extent_tree_lock);
|
||||
et = radix_tree_lookup(&sbi->extent_tree_root, ino);
|
||||
if (!et) {
|
||||
et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
|
||||
et = f2fs_kmem_cache_alloc(extent_tree_slab,
|
||||
GFP_NOFS, true, NULL);
|
||||
f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
|
||||
memset(et, 0, sizeof(struct extent_tree));
|
||||
et->ino = ino;
|
||||
@ -325,9 +359,10 @@ static void __drop_largest_extent(struct extent_tree *et,
|
||||
}
|
||||
|
||||
/* return true, if inode page is changed */
|
||||
static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
|
||||
static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
||||
struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL;
|
||||
struct extent_tree *et;
|
||||
struct extent_node *en;
|
||||
struct extent_info ei;
|
||||
@ -335,16 +370,18 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e
|
||||
if (!f2fs_may_extent_tree(inode)) {
|
||||
/* drop largest extent */
|
||||
if (i_ext && i_ext->len) {
|
||||
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
|
||||
i_ext->len = 0;
|
||||
return true;
|
||||
set_page_dirty(ipage);
|
||||
return;
|
||||
}
|
||||
return false;
|
||||
return;
|
||||
}
|
||||
|
||||
et = __grab_extent_tree(inode);
|
||||
|
||||
if (!i_ext || !i_ext->len)
|
||||
return false;
|
||||
return;
|
||||
|
||||
get_extent_info(&ei, i_ext);
|
||||
|
||||
@ -360,17 +397,14 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e
|
||||
}
|
||||
out:
|
||||
write_unlock(&et->lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
|
||||
void f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
|
||||
{
|
||||
bool ret = __f2fs_init_extent_tree(inode, i_ext);
|
||||
__f2fs_init_extent_tree(inode, ipage);
|
||||
|
||||
if (!F2FS_I(inode)->extent_tree)
|
||||
set_inode_flag(inode, FI_NO_EXTENT);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
|
||||
@ -628,6 +662,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
|
||||
f2fs_mark_inode_dirty_sync(inode, true);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
void f2fs_update_extent_tree_range_compressed(struct inode *inode,
|
||||
pgoff_t fofs, block_t blkaddr, unsigned int llen,
|
||||
unsigned int c_len)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
||||
struct extent_tree *et = F2FS_I(inode)->extent_tree;
|
||||
struct extent_node *en = NULL;
|
||||
struct extent_node *prev_en = NULL, *next_en = NULL;
|
||||
struct extent_info ei;
|
||||
struct rb_node **insert_p = NULL, *insert_parent = NULL;
|
||||
bool leftmost = false;
|
||||
|
||||
trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen);
|
||||
|
||||
/* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
|
||||
if (is_inode_flag_set(inode, FI_NO_EXTENT))
|
||||
return;
|
||||
|
||||
write_lock(&et->lock);
|
||||
|
||||
en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
|
||||
(struct rb_entry *)et->cached_en, fofs,
|
||||
(struct rb_entry **)&prev_en,
|
||||
(struct rb_entry **)&next_en,
|
||||
&insert_p, &insert_parent, false,
|
||||
&leftmost);
|
||||
if (en)
|
||||
goto unlock_out;
|
||||
|
||||
set_extent_info(&ei, fofs, blkaddr, llen);
|
||||
ei.c_len = c_len;
|
||||
|
||||
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
|
||||
__insert_extent_tree(sbi, et, &ei,
|
||||
insert_p, insert_parent, leftmost);
|
||||
unlock_out:
|
||||
write_unlock(&et->lock);
|
||||
}
|
||||
#endif
|
||||
|
||||
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
|
||||
{
|
||||
struct extent_tree *et, *next;
|
||||
|
fs/f2fs/f2fs.h (1094 changes): diff suppressed because it is too large
fs/f2fs/file.c (1224 changes): diff suppressed because it is too large
fs/f2fs/gc.c (677 changes): diff suppressed because it is too large
fs/f2fs/gc.h (75 changes)
@ -14,6 +14,14 @@
#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */

/* choose candidates from sections which has age of more than 7 days */
#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7)
#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */
#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */
#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */
#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */

#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */

@ -34,6 +42,12 @@ struct f2fs_gc_kthread {

	/* for changing gc mode */
	unsigned int gc_wake;

	/* for GC_MERGE mount option */
	wait_queue_head_t fggc_wq; /*
				    * caller of f2fs_balance_fs()
				    * will wait on this wait queue.
				    */
};

struct gc_inode_list {
@ -41,16 +55,69 @@ struct gc_inode_list {
	struct radix_tree_root iroot;
};

struct victim_info {
	unsigned long long mtime;	/* mtime of section */
	unsigned int segno;		/* section No. */
};

struct victim_entry {
	struct rb_node rb_node;		/* rb node located in rb-tree */
	union {
		struct {
			unsigned long long mtime;	/* mtime of section */
			unsigned int segno;		/* segment No. */
		};
		struct victim_info vi;	/* victim info */
	};
	struct list_head list;
};

/*
 * inline functions
 */

/*
 * On a Zoned device zone-capacity can be less than zone-size and if
 * zone-capacity is not aligned to f2fs segment size(2MB), then the segment
 * starting just before zone-capacity has some blocks spanning across the
 * zone-capacity, these blocks are not usable.
 * Such spanning segments can be in free list so calculate the sum of usable
 * blocks in currently free segments including normal and spanning segments.
 */
static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi)
{
	block_t free_seg_blks = 0;
	struct free_segmap_info *free_i = FREE_I(sbi);
	int j;

	spin_lock(&free_i->segmap_lock);
	for (j = 0; j < MAIN_SEGS(sbi); j++)
		if (!test_bit(j, free_i->free_segmap))
			free_seg_blks += f2fs_usable_blks_in_seg(sbi, j);
	spin_unlock(&free_i->segmap_lock);

	return free_seg_blks;
}

static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi)
{
	if (f2fs_sb_has_blkzoned(sbi))
		return free_segs_blk_count_zoned(sbi);

	return free_segments(sbi) << sbi->log_blocks_per_seg;
}

static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
{
	if (free_segments(sbi) < overprovision_segments(sbi))
	block_t free_blks, ovp_blks;

	free_blks = free_segs_blk_count(sbi);
	ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg;

	if (free_blks < ovp_blks)
		return 0;
	else
		return (free_segments(sbi) - overprovision_segments(sbi))
					<< sbi->log_blocks_per_seg;

	return free_blks - ovp_blks;
}

static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
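The reworked free_user_blocks() above subtracts an overprovision reserve, expressed in blocks, from the usable free-block count instead of comparing raw segment counts, and clamps the result at zero. A tiny standalone sketch of that arithmetic with invented numbers, illustrative only and not the kernel code:

#include <stdio.h>

typedef unsigned int block_t;

/* hypothetical geometry: 512 blocks per segment (log2 = 9) */
#define LOG_BLOCKS_PER_SEG 9

static block_t free_user_blocks_demo(block_t free_blks, block_t ovp_segments)
{
	block_t ovp_blks = ovp_segments << LOG_BLOCKS_PER_SEG;

	if (free_blks < ovp_blks)
		return 0;	/* everything left is reserved for overprovisioning */

	return free_blks - ovp_blks;
}

int main(void)
{
	/* 100 free segments of usable blocks against a 20-segment reserve: 40960 */
	printf("%u\n", free_user_blocks_demo(100u << LOG_BLOCKS_PER_SEG, 20));
	/* fewer usable free blocks than the reserve: clamped to 0 */
	printf("%u\n", free_user_blocks_demo(10u << LOG_BLOCKS_PER_SEG, 20));
	return 0;
}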
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include "f2fs.h"
|
||||
#include "node.h"
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
bool f2fs_may_inline_data(struct inode *inode)
|
||||
{
|
||||
@ -129,7 +130,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = f2fs_get_node_info(fio.sbi, dn->nid, &ni);
|
||||
err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false);
|
||||
if (err) {
|
||||
f2fs_truncate_data_blocks_range(dn, 1);
|
||||
f2fs_put_dnode(dn);
|
||||
@ -171,7 +172,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
|
||||
|
||||
/* clear inline data and flag after data writeback */
|
||||
f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0);
|
||||
clear_inline_node(dn->inode_page);
|
||||
clear_page_private_inline(dn->inode_page);
|
||||
clear_out:
|
||||
stat_dec_inline_inode(dn->inode);
|
||||
clear_inode_flag(dn->inode, FI_INLINE_DATA);
|
||||
@ -186,9 +187,14 @@ int f2fs_convert_inline_inode(struct inode *inode)
|
||||
struct page *ipage, *page;
|
||||
int err = 0;
|
||||
|
||||
if (!f2fs_has_inline_data(inode))
|
||||
if (!f2fs_has_inline_data(inode) ||
|
||||
f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb))
|
||||
return 0;
|
||||
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
@ -248,12 +254,12 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
|
||||
set_inode_flag(inode, FI_APPEND_WRITE);
|
||||
set_inode_flag(inode, FI_DATA_EXIST);
|
||||
|
||||
clear_inline_node(dn.inode_page);
|
||||
clear_page_private_inline(dn.inode_page);
|
||||
f2fs_put_dnode(&dn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
|
||||
int f2fs_recover_inline_data(struct inode *inode, struct page *npage)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
||||
struct f2fs_inode *ri = NULL;
|
||||
@ -265,7 +271,7 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
|
||||
* [prev.] [next] of inline_data flag
|
||||
* o o -> recover inline_data
|
||||
* o x -> remove inline_data, and then recover data blocks
|
||||
* x o -> remove inline_data, and then recover inline_data
|
||||
* x o -> remove data blocks, and then recover inline_data
|
||||
* x x -> recover data blocks
|
||||
*/
|
||||
if (IS_INODE(npage))
|
||||
@ -275,7 +281,8 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
|
||||
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
|
||||
process_inline:
|
||||
ipage = f2fs_get_node_page(sbi, inode->i_ino);
|
||||
f2fs_bug_on(sbi, IS_ERR(ipage));
|
||||
if (IS_ERR(ipage))
|
||||
return PTR_ERR(ipage);
|
||||
|
||||
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
|
||||
|
||||
@ -288,21 +295,27 @@ process_inline:
|
||||
|
||||
set_page_dirty(ipage);
|
||||
f2fs_put_page(ipage, 1);
|
||||
return true;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (f2fs_has_inline_data(inode)) {
|
||||
ipage = f2fs_get_node_page(sbi, inode->i_ino);
|
||||
f2fs_bug_on(sbi, IS_ERR(ipage));
|
||||
if (IS_ERR(ipage))
|
||||
return PTR_ERR(ipage);
|
||||
f2fs_truncate_inline_inode(inode, ipage, 0);
|
||||
stat_dec_inline_inode(inode);
|
||||
clear_inode_flag(inode, FI_INLINE_DATA);
|
||||
f2fs_put_page(ipage, 1);
|
||||
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
|
||||
if (f2fs_truncate_blocks(inode, 0, false))
|
||||
return false;
|
||||
int ret;
|
||||
|
||||
ret = f2fs_truncate_blocks(inode, 0, false);
|
||||
if (ret)
|
||||
return ret;
|
||||
stat_inc_inline_inode(inode);
|
||||
goto process_inline;
|
||||
}
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
|
||||
@ -326,6 +339,10 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
|
||||
make_dentry_ptr_inline(dir, &d, inline_dentry);
|
||||
de = f2fs_find_target_dentry(&d, fname, NULL);
|
||||
unlock_page(ipage);
|
||||
if (IS_ERR(de)) {
|
||||
*res_page = ERR_CAST(de);
|
||||
de = NULL;
|
||||
}
|
||||
if (de)
|
||||
*res_page = ipage;
|
||||
else
|
||||
@ -518,7 +535,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
|
||||
!f2fs_has_inline_xattr(dir))
|
||||
F2FS_I(dir)->i_inline_xattr_size = 0;
|
||||
|
||||
kvfree(backup_dentry);
|
||||
kfree(backup_dentry);
|
||||
return 0;
|
||||
recover:
|
||||
lock_page(ipage);
|
||||
@ -529,7 +546,7 @@ recover:
|
||||
set_page_dirty(ipage);
|
||||
f2fs_put_page(ipage, 1);
|
||||
|
||||
kvfree(backup_dentry);
|
||||
kfree(backup_dentry);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -611,7 +628,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
|
||||
}
|
||||
|
||||
if (inode) {
|
||||
down_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(inode)->i_sem);
|
||||
page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
|
||||
if (IS_ERR(page)) {
|
||||
err = PTR_ERR(page);
|
||||
@ -640,7 +657,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
|
||||
f2fs_update_parent_metadata(dir, inode, 0);
|
||||
fail:
|
||||
if (inode)
|
||||
up_write(&F2FS_I(inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(inode)->i_sem);
|
||||
out:
|
||||
f2fs_put_page(ipage, 1);
|
||||
return err;
|
||||
@ -768,7 +785,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
|
||||
ilen = start + len;
|
||||
ilen -= start;
|
||||
|
||||
err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
|
||||
err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
@ -776,6 +793,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
|
||||
byteaddr += (char *)inline_data_addr(inode, ipage) -
|
||||
(char *)F2FS_INODE(ipage);
|
||||
err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
|
||||
trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err);
|
||||
out:
|
||||
f2fs_put_page(ipage, 1);
|
||||
return err;
|
||||
|
@ -18,6 +18,10 @@
|
||||
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
extern const struct address_space_operations f2fs_compress_aops;
|
||||
#endif
|
||||
|
||||
void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
|
||||
{
|
||||
if (is_inode_flag_set(inode, FI_NEW_INODE))
|
||||
@ -287,11 +291,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) {
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off",
|
||||
__func__, inode->i_ino);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) &&
|
||||
fi->i_flags & F2FS_COMPR_FL &&
|
||||
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
|
||||
i_log_cluster_size)) {
|
||||
if (ri->i_compress_algorithm >= COMPRESS_MAX) {
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
|
||||
"compress algorithm: %u, run fsck to fix",
|
||||
__func__, inode->i_ino,
|
||||
@ -300,6 +312,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
|
||||
}
|
||||
if (le64_to_cpu(ri->i_compr_blocks) >
|
||||
SECTOR_TO_BLOCK(inode->i_blocks)) {
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent "
|
||||
"i_compr_blocks:%llu, i_blocks:%lu, run fsck to fix",
|
||||
__func__, inode->i_ino,
|
||||
@ -309,6 +322,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
|
||||
}
|
||||
if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
|
||||
ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
|
||||
"log cluster size: %u, run fsck to fix",
|
||||
__func__, inode->i_ino,
|
||||
@ -367,8 +381,7 @@ static int do_read_inode(struct inode *inode)
|
||||
fi->i_pino = le32_to_cpu(ri->i_pino);
|
||||
fi->i_dir_level = ri->i_dir_level;
|
||||
|
||||
if (f2fs_init_extent_tree(inode, &ri->i_ext))
|
||||
set_page_dirty(node_page);
|
||||
f2fs_init_extent_tree(inode, node_page);
|
||||
|
||||
get_inline_info(inode, ri);
|
||||
|
||||
@ -402,6 +415,7 @@ static int do_read_inode(struct inode *inode)
|
||||
|
||||
/* try to recover cold bit for non-dir inode */
|
||||
if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) {
|
||||
f2fs_wait_on_page_writeback(node_page, NODE, true, true);
|
||||
set_cold_node(node_page, false);
|
||||
set_page_dirty(node_page);
|
||||
}
|
||||
@ -442,9 +456,11 @@ static int do_read_inode(struct inode *inode)
|
||||
(fi->i_flags & F2FS_COMPR_FL)) {
|
||||
if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
|
||||
i_log_cluster_size)) {
|
||||
fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks);
|
||||
atomic_set(&fi->i_compr_blocks,
|
||||
le64_to_cpu(ri->i_compr_blocks));
|
||||
fi->i_compress_algorithm = ri->i_compress_algorithm;
|
||||
fi->i_log_cluster_size = ri->i_log_cluster_size;
|
||||
fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag);
|
||||
fi->i_cluster_size = 1 << fi->i_log_cluster_size;
|
||||
set_inode_flag(inode, FI_COMPRESSED_FILE);
|
||||
}
|
||||
@ -460,7 +476,7 @@ static int do_read_inode(struct inode *inode)
|
||||
stat_inc_inline_inode(inode);
|
||||
stat_inc_inline_dir(inode);
|
||||
stat_inc_compr_inode(inode);
|
||||
stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
|
||||
stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks));
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -482,6 +498,11 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
|
||||
if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
|
||||
goto make_now;
|
||||
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
if (ino == F2FS_COMPRESS_INO(sbi))
|
||||
goto make_now;
|
||||
#endif
|
||||
|
||||
ret = do_read_inode(inode);
|
||||
if (ret)
|
||||
goto bad_inode;
|
||||
@ -492,6 +513,17 @@ make_now:
|
||||
} else if (ino == F2FS_META_INO(sbi)) {
|
||||
inode->i_mapping->a_ops = &f2fs_meta_aops;
|
||||
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
|
||||
} else if (ino == F2FS_COMPRESS_INO(sbi)) {
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
inode->i_mapping->a_ops = &f2fs_compress_aops;
|
||||
/*
|
||||
* generic_error_remove_page only truncates pages of regular
|
||||
* inode
|
||||
*/
|
||||
inode->i_mode |= S_IFREG;
|
||||
#endif
|
||||
mapping_set_gfp_mask(inode->i_mapping,
|
||||
GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
|
||||
} else if (S_ISREG(inode->i_mode)) {
|
||||
inode->i_op = &f2fs_file_inode_operations;
|
||||
inode->i_fop = &f2fs_file_operations;
|
||||
@ -500,7 +532,7 @@ make_now:
|
||||
inode->i_op = &f2fs_dir_inode_operations;
|
||||
inode->i_fop = &f2fs_dir_operations;
|
||||
inode->i_mapping->a_ops = &f2fs_dblock_aops;
|
||||
inode_nohighmem(inode);
|
||||
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
|
||||
} else if (S_ISLNK(inode->i_mode)) {
|
||||
if (file_is_encrypt(inode))
|
||||
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
|
||||
@ -517,6 +549,14 @@ make_now:
|
||||
goto bad_inode;
|
||||
}
|
||||
f2fs_set_inode_flags(inode);
|
||||
|
||||
if (file_should_truncate(inode)) {
|
||||
ret = f2fs_truncate(inode);
|
||||
if (ret)
|
||||
goto bad_inode;
|
||||
file_dont_truncate(inode);
|
||||
}
|
||||
|
||||
unlock_new_inode(inode);
|
||||
trace_f2fs_iget(inode);
|
||||
return inode;
|
||||
@ -619,9 +659,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
|
||||
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
|
||||
i_log_cluster_size)) {
|
||||
ri->i_compr_blocks =
|
||||
cpu_to_le64(F2FS_I(inode)->i_compr_blocks);
|
||||
cpu_to_le64(atomic_read(
|
||||
&F2FS_I(inode)->i_compr_blocks));
|
||||
ri->i_compress_algorithm =
|
||||
F2FS_I(inode)->i_compress_algorithm;
|
||||
ri->i_compress_flag =
|
||||
cpu_to_le16(F2FS_I(inode)->i_compress_flag);
|
||||
ri->i_log_cluster_size =
|
||||
F2FS_I(inode)->i_log_cluster_size;
|
||||
}
|
||||
@ -631,7 +674,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
|
||||
|
||||
/* deleted inode */
|
||||
if (inode->i_nlink == 0)
|
||||
clear_inline_node(node_page);
|
||||
clear_page_private_inline(node_page);
|
||||
|
||||
F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
|
||||
F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
|
||||
@ -651,6 +694,7 @@ retry:
|
||||
node_page = f2fs_get_node_page(sbi, inode->i_ino);
|
||||
if (IS_ERR(node_page)) {
|
||||
int err = PTR_ERR(node_page);
|
||||
|
||||
if (err == -ENOMEM) {
|
||||
cond_resched();
|
||||
goto retry;
|
||||
@ -683,7 +727,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
|
||||
|
||||
/*
|
||||
* We need to balance fs here to prevent from producing dirty node pages
|
||||
* during the urgent cleaning time when runing out of free sections.
|
||||
* during the urgent cleaning time when running out of free sections.
|
||||
*/
|
||||
f2fs_update_inode_page(inode);
|
||||
if (wbc && wbc->nr_to_write)
|
||||
@ -707,8 +751,13 @@ void f2fs_evict_inode(struct inode *inode)
|
||||
trace_f2fs_evict_inode(inode);
|
||||
truncate_inode_pages_final(&inode->i_data);
|
||||
|
||||
if ((inode->i_nlink || is_bad_inode(inode)) &&
|
||||
test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode))
|
||||
f2fs_invalidate_compress_pages(sbi, inode->i_ino);
|
||||
|
||||
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
|
||||
inode->i_ino == F2FS_META_INO(sbi))
|
||||
inode->i_ino == F2FS_META_INO(sbi) ||
|
||||
inode->i_ino == F2FS_COMPRESS_INO(sbi))
|
||||
goto out_clear;
|
||||
|
||||
f2fs_bug_on(sbi, get_dirty_pages(inode));
|
||||
@ -719,7 +768,7 @@ void f2fs_evict_inode(struct inode *inode)
|
||||
if (inode->i_nlink || is_bad_inode(inode))
|
||||
goto no_delete;
|
||||
|
||||
err = dquot_initialize(inode);
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err) {
|
||||
err = 0;
|
||||
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
|
||||
@ -729,7 +778,8 @@ void f2fs_evict_inode(struct inode *inode)
|
||||
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
|
||||
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
|
||||
|
||||
sb_start_intwrite(inode->i_sb);
|
||||
if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
|
||||
sb_start_intwrite(inode->i_sb);
|
||||
set_inode_flag(inode, FI_NO_ALLOC);
|
||||
i_size_write(inode, 0);
|
||||
retry:
|
||||
@ -760,7 +810,8 @@ retry:
|
||||
if (dquot_initialize_needed(inode))
|
||||
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
|
||||
}
|
||||
sb_end_intwrite(inode->i_sb);
|
||||
if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
|
||||
sb_end_intwrite(inode->i_sb);
|
||||
no_delete:
|
||||
dquot_drop(inode);
|
||||
|
||||
@ -768,7 +819,8 @@ no_delete:
|
||||
stat_dec_inline_dir(inode);
|
||||
stat_dec_inline_inode(inode);
|
||||
stat_dec_compr_inode(inode);
|
||||
stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
|
||||
stat_sub_compr_blocks(inode,
|
||||
atomic_read(&F2FS_I(inode)->i_compr_blocks));
|
||||
|
||||
if (unlikely(is_inode_flag_set(inode, FI_DIRTY_INODE))) {
|
||||
f2fs_inode_synced(inode);
|
||||
@ -835,9 +887,10 @@ void f2fs_handle_failed_inode(struct inode *inode)
|
||||
* so we can prevent losing this orphan when encoutering checkpoint
|
||||
* and following suddenly power-off.
|
||||
*/
|
||||
err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
|
||||
err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
|
||||
if (err) {
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
set_inode_flag(inode, FI_FREE_NID);
|
||||
f2fs_warn(sbi, "May loss orphan inode, run fsck to fix.");
|
||||
goto out;
|
||||
}
|
||||
|
287
fs/f2fs/iostat.c
Normal file
287
fs/f2fs/iostat.c
Normal file
@ -0,0 +1,287 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* f2fs iostat support
|
||||
*
|
||||
* Copyright 2021 Google LLC
|
||||
* Author: Daeho Jeong <daehojeong@google.com>
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
#include "f2fs.h"
|
||||
#include "iostat.h"
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
#define NUM_PREALLOC_IOSTAT_CTXS 128
|
||||
static struct kmem_cache *bio_iostat_ctx_cache;
|
||||
static mempool_t *bio_iostat_ctx_pool;
|
||||
|
||||
int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset)
|
||||
{
|
||||
struct super_block *sb = seq->private;
|
||||
struct f2fs_sb_info *sbi = F2FS_SB(sb);
|
||||
time64_t now = ktime_get_real_seconds();
|
||||
|
||||
if (!sbi->iostat_enable)
|
||||
return 0;
|
||||
|
||||
seq_printf(seq, "time: %-16llu\n", now);
|
||||
|
||||
/* print app write IOs */
|
||||
seq_puts(seq, "[WRITE]\n");
|
||||
seq_printf(seq, "app buffered: %-16llu\n",
|
||||
sbi->rw_iostat[APP_BUFFERED_IO]);
|
||||
seq_printf(seq, "app direct: %-16llu\n",
|
||||
sbi->rw_iostat[APP_DIRECT_IO]);
|
||||
seq_printf(seq, "app mapped: %-16llu\n",
|
||||
sbi->rw_iostat[APP_MAPPED_IO]);
|
||||
|
||||
/* print fs write IOs */
|
||||
seq_printf(seq, "fs data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_DATA_IO]);
|
||||
seq_printf(seq, "fs node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_NODE_IO]);
|
||||
seq_printf(seq, "fs meta: %-16llu\n",
|
||||
sbi->rw_iostat[FS_META_IO]);
|
||||
seq_printf(seq, "fs gc data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_GC_DATA_IO]);
|
||||
seq_printf(seq, "fs gc node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_GC_NODE_IO]);
|
||||
seq_printf(seq, "fs cp data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CP_DATA_IO]);
|
||||
seq_printf(seq, "fs cp node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CP_NODE_IO]);
|
||||
seq_printf(seq, "fs cp meta: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CP_META_IO]);
|
||||
|
||||
/* print app read IOs */
|
||||
seq_puts(seq, "[READ]\n");
|
||||
seq_printf(seq, "app buffered: %-16llu\n",
|
||||
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
|
||||
seq_printf(seq, "app direct: %-16llu\n",
|
||||
sbi->rw_iostat[APP_DIRECT_READ_IO]);
|
||||
seq_printf(seq, "app mapped: %-16llu\n",
|
||||
sbi->rw_iostat[APP_MAPPED_READ_IO]);
|
||||
|
||||
/* print fs read IOs */
|
||||
seq_printf(seq, "fs data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_DATA_READ_IO]);
|
||||
seq_printf(seq, "fs gc data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_GDATA_READ_IO]);
|
||||
seq_printf(seq, "fs compr_data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CDATA_READ_IO]);
|
||||
seq_printf(seq, "fs node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_NODE_READ_IO]);
|
||||
seq_printf(seq, "fs meta: %-16llu\n",
|
||||
sbi->rw_iostat[FS_META_READ_IO]);
|
||||
|
||||
/* print other IOs */
|
||||
seq_puts(seq, "[OTHER]\n");
|
||||
seq_printf(seq, "fs discard: %-16llu\n",
|
||||
sbi->rw_iostat[FS_DISCARD]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
int io, idx = 0;
|
||||
unsigned int cnt;
|
||||
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
|
||||
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
|
||||
|
||||
spin_lock_irq(&sbi->iostat_lat_lock);
|
||||
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
|
||||
for (io = 0; io < NR_PAGE_TYPE; io++) {
|
||||
cnt = io_lat->bio_cnt[idx][io];
|
||||
iostat_lat[idx][io].peak_lat =
|
||||
jiffies_to_msecs(io_lat->peak_lat[idx][io]);
|
||||
iostat_lat[idx][io].cnt = cnt;
|
||||
iostat_lat[idx][io].avg_lat = cnt ?
|
||||
jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0;
|
||||
io_lat->sum_lat[idx][io] = 0;
|
||||
io_lat->peak_lat[idx][io] = 0;
|
||||
io_lat->bio_cnt[idx][io] = 0;
|
||||
}
|
||||
}
|
||||
spin_unlock_irq(&sbi->iostat_lat_lock);
|
||||
|
||||
trace_f2fs_iostat_latency(sbi, iostat_lat);
|
||||
}
|
||||
|
||||
static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
unsigned long long iostat_diff[NR_IO_TYPE];
|
||||
int i;
|
||||
|
||||
if (time_is_after_jiffies(sbi->iostat_next_period))
|
||||
return;
|
||||
|
||||
/* Need double check under the lock */
|
||||
spin_lock(&sbi->iostat_lock);
|
||||
if (time_is_after_jiffies(sbi->iostat_next_period)) {
|
||||
spin_unlock(&sbi->iostat_lock);
|
||||
return;
|
||||
}
|
||||
sbi->iostat_next_period = jiffies +
|
||||
msecs_to_jiffies(sbi->iostat_period_ms);
|
||||
|
||||
for (i = 0; i < NR_IO_TYPE; i++) {
|
||||
iostat_diff[i] = sbi->rw_iostat[i] -
|
||||
sbi->prev_rw_iostat[i];
|
||||
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
|
||||
}
|
||||
spin_unlock(&sbi->iostat_lock);
|
||||
|
||||
trace_f2fs_iostat(sbi, iostat_diff);
|
||||
|
||||
__record_iostat_latency(sbi);
|
||||
}
|
||||
|
||||
void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
|
||||
int i;
|
||||
|
||||
spin_lock(&sbi->iostat_lock);
|
||||
for (i = 0; i < NR_IO_TYPE; i++) {
|
||||
sbi->rw_iostat[i] = 0;
|
||||
sbi->prev_rw_iostat[i] = 0;
|
||||
}
|
||||
spin_unlock(&sbi->iostat_lock);
|
||||
|
||||
spin_lock_irq(&sbi->iostat_lat_lock);
|
||||
memset(io_lat, 0, sizeof(struct iostat_lat_info));
|
||||
spin_unlock_irq(&sbi->iostat_lat_lock);
|
||||
}
|
||||
|
||||
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
|
||||
enum iostat_type type, unsigned long long io_bytes)
|
||||
{
|
||||
if (!sbi->iostat_enable)
|
||||
return;
|
||||
|
||||
spin_lock(&sbi->iostat_lock);
|
||||
sbi->rw_iostat[type] += io_bytes;
|
||||
|
||||
if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
|
||||
sbi->rw_iostat[APP_BUFFERED_IO] =
|
||||
sbi->rw_iostat[APP_WRITE_IO] -
|
||||
sbi->rw_iostat[APP_DIRECT_IO];
|
||||
|
||||
if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
|
||||
sbi->rw_iostat[APP_BUFFERED_READ_IO] =
|
||||
sbi->rw_iostat[APP_READ_IO] -
|
||||
sbi->rw_iostat[APP_DIRECT_READ_IO];
|
||||
spin_unlock(&sbi->iostat_lock);
|
||||
|
||||
f2fs_record_iostat(sbi);
|
||||
}
|
||||
|
||||
static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
|
||||
int rw, bool is_sync)
|
||||
{
|
||||
unsigned long ts_diff;
|
||||
unsigned int iotype = iostat_ctx->type;
|
||||
unsigned long flags;
|
||||
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
|
||||
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
|
||||
int idx;
|
||||
|
||||
if (!sbi->iostat_enable)
|
||||
return;
|
||||
|
||||
ts_diff = jiffies - iostat_ctx->submit_ts;
|
||||
if (iotype >= META_FLUSH)
|
||||
iotype = META;
|
||||
|
||||
if (rw == 0) {
|
||||
idx = READ_IO;
|
||||
} else {
|
||||
if (is_sync)
|
||||
idx = WRITE_SYNC_IO;
|
||||
else
|
||||
idx = WRITE_ASYNC_IO;
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
|
||||
io_lat->sum_lat[idx][iotype] += ts_diff;
|
||||
io_lat->bio_cnt[idx][iotype]++;
|
||||
if (ts_diff > io_lat->peak_lat[idx][iotype])
|
||||
io_lat->peak_lat[idx][iotype] = ts_diff;
|
||||
spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
|
||||
}
|
||||
|
||||
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
|
||||
{
|
||||
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
|
||||
bool is_sync = bio->bi_opf & REQ_SYNC;
|
||||
|
||||
if (rw == 0)
|
||||
bio->bi_private = iostat_ctx->post_read_ctx;
|
||||
else
|
||||
bio->bi_private = iostat_ctx->sbi;
|
||||
__update_iostat_latency(iostat_ctx, rw, is_sync);
|
||||
mempool_free(iostat_ctx, bio_iostat_ctx_pool);
|
||||
}
|
||||
|
||||
void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
|
||||
struct bio *bio, struct bio_post_read_ctx *ctx)
|
||||
{
|
||||
struct bio_iostat_ctx *iostat_ctx;
|
||||
/* Due to the mempool, this never fails. */
|
||||
iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS);
|
||||
iostat_ctx->sbi = sbi;
|
||||
iostat_ctx->submit_ts = 0;
|
||||
iostat_ctx->type = 0;
|
||||
iostat_ctx->post_read_ctx = ctx;
|
||||
bio->bi_private = iostat_ctx;
|
||||
}
|
||||
|
||||
int __init f2fs_init_iostat_processing(void)
|
||||
{
|
||||
bio_iostat_ctx_cache =
|
||||
kmem_cache_create("f2fs_bio_iostat_ctx",
|
||||
sizeof(struct bio_iostat_ctx), 0, 0, NULL);
|
||||
if (!bio_iostat_ctx_cache)
|
||||
goto fail;
|
||||
bio_iostat_ctx_pool =
|
||||
mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS,
|
||||
bio_iostat_ctx_cache);
|
||||
if (!bio_iostat_ctx_pool)
|
||||
goto fail_free_cache;
|
||||
return 0;
|
||||
|
||||
fail_free_cache:
|
||||
kmem_cache_destroy(bio_iostat_ctx_cache);
|
||||
fail:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
void f2fs_destroy_iostat_processing(void)
|
||||
{
|
||||
mempool_destroy(bio_iostat_ctx_pool);
|
||||
kmem_cache_destroy(bio_iostat_ctx_cache);
|
||||
}
|
||||
|
||||
int f2fs_init_iostat(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
/* init iostat info */
|
||||
spin_lock_init(&sbi->iostat_lock);
|
||||
spin_lock_init(&sbi->iostat_lat_lock);
|
||||
sbi->iostat_enable = false;
|
||||
sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
|
||||
sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info),
|
||||
GFP_KERNEL);
|
||||
if (!sbi->iostat_io_lat)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void f2fs_destroy_iostat(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
kfree(sbi->iostat_io_lat);
|
||||
}
|
84
fs/f2fs/iostat.h
Normal file
84
fs/f2fs/iostat.h
Normal file
@ -0,0 +1,84 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright 2021 Google LLC
|
||||
* Author: Daeho Jeong <daehojeong@google.com>
|
||||
*/
|
||||
#ifndef __F2FS_IOSTAT_H__
|
||||
#define __F2FS_IOSTAT_H__
|
||||
|
||||
struct bio_post_read_ctx;
|
||||
|
||||
#ifdef CONFIG_F2FS_IOSTAT
|
||||
|
||||
#define DEFAULT_IOSTAT_PERIOD_MS 3000
|
||||
#define MIN_IOSTAT_PERIOD_MS 100
|
||||
/* maximum period of iostat tracing is 1 day */
|
||||
#define MAX_IOSTAT_PERIOD_MS 8640000
|
||||
|
||||
enum {
|
||||
READ_IO,
|
||||
WRITE_SYNC_IO,
|
||||
WRITE_ASYNC_IO,
|
||||
MAX_IO_TYPE,
|
||||
};
|
||||
|
||||
struct iostat_lat_info {
|
||||
unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */
|
||||
unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */
|
||||
unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */
|
||||
};
|
||||
|
||||
extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
|
||||
void *offset);
|
||||
extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi);
|
||||
extern void f2fs_update_iostat(struct f2fs_sb_info *sbi,
|
||||
enum iostat_type type, unsigned long long io_bytes);
|
||||
|
||||
struct bio_iostat_ctx {
|
||||
struct f2fs_sb_info *sbi;
|
||||
unsigned long submit_ts;
|
||||
enum page_type type;
|
||||
struct bio_post_read_ctx *post_read_ctx;
|
||||
};
|
||||
|
||||
static inline void iostat_update_submit_ctx(struct bio *bio,
|
||||
enum page_type type)
|
||||
{
|
||||
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
|
||||
|
||||
iostat_ctx->submit_ts = jiffies;
|
||||
iostat_ctx->type = type;
|
||||
}
|
||||
|
||||
static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
|
||||
{
|
||||
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
|
||||
|
||||
return iostat_ctx->post_read_ctx;
|
||||
}
|
||||
|
||||
extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw);
|
||||
extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
|
||||
struct bio *bio, struct bio_post_read_ctx *ctx);
|
||||
extern int f2fs_init_iostat_processing(void);
|
||||
extern void f2fs_destroy_iostat_processing(void);
|
||||
extern int f2fs_init_iostat(struct f2fs_sb_info *sbi);
|
||||
extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi);
|
||||
#else
|
||||
static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
|
||||
enum iostat_type type, unsigned long long io_bytes) {}
|
||||
static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {}
|
||||
static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
|
||||
struct bio *bio, struct bio_post_read_ctx *ctx) {}
|
||||
static inline void iostat_update_submit_ctx(struct bio *bio,
|
||||
enum page_type type) {}
|
||||
static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
|
||||
{
|
||||
return bio->bi_private;
|
||||
}
|
||||
static inline int f2fs_init_iostat_processing(void) { return 0; }
|
||||
static inline void f2fs_destroy_iostat_processing(void) {}
|
||||
static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; }
|
||||
static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {}
|
||||
#endif
|
||||
#endif /* __F2FS_IOSTAT_H__ */
|
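
For orientation, the sketch below shows the intended call order for the hooks declared above: bind an iostat context before a bio leaves f2fs, stamp it at submission, and unbind it (recording the latency) in the completion path. This is a minimal sketch only, not part of the patch; the two f2fs_*tracked* function names are hypothetical stand-ins, and only the iostat_* helpers and get_post_read_ctx() come from this header.

/* Illustrative caller, not part of the patch: function names are hypothetical. */
static void f2fs_tracked_end_io(struct bio *bio)
{
	/* compute latency from submit_ts, then restore the original bi_private */
	iostat_update_and_unbind_ctx(bio, bio_data_dir(bio));
	/* ... continue normal completion, e.g. via get_post_read_ctx(bio) ... */
}

static void f2fs_submit_tracked_bio(struct f2fs_sb_info *sbi, struct bio *bio,
			struct bio_post_read_ctx *ctx, enum page_type type)
{
	iostat_alloc_and_bind_ctx(sbi, bio, ctx);	/* bio->bi_private now holds the iostat ctx */
	bio->bi_end_io = f2fs_tracked_end_io;
	iostat_update_submit_ctx(bio, type);		/* record jiffies and the page type */
	submit_bio(bio);
}
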
138	fs/f2fs/namei.c
@ -69,7 +69,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
|
||||
F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
|
||||
F2FS_DEF_PROJID);
|
||||
|
||||
err = dquot_initialize(inode);
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err)
|
||||
goto fail_drop;
|
||||
|
||||
@ -148,7 +148,8 @@ fail_drop:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static inline int is_extension_exist(const unsigned char *s, const char *sub)
|
||||
static inline int is_extension_exist(const unsigned char *s, const char *sub,
|
||||
bool tmp_ext)
|
||||
{
|
||||
size_t slen = strlen(s);
|
||||
size_t sublen = strlen(sub);
|
||||
@ -164,6 +165,13 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
|
||||
if (slen < sublen + 2)
|
||||
return 0;
|
||||
|
||||
if (!tmp_ext) {
|
||||
/* file has no temp extension */
|
||||
if (s[slen - sublen - 1] != '.')
|
||||
return 0;
|
||||
return !strncasecmp(s + slen - sublen, sub, sublen);
|
||||
}
|
||||
|
||||
for (i = 1; i < slen - sublen; i++) {
|
||||
if (s[i] != '.')
|
||||
continue;
|
||||
@ -183,17 +191,17 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
|
||||
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
|
||||
int i, cold_count, hot_count;
|
||||
|
||||
down_read(&sbi->sb_lock);
|
||||
f2fs_down_read(&sbi->sb_lock);
|
||||
|
||||
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
|
||||
hot_count = sbi->raw_super->hot_ext_count;
|
||||
|
||||
for (i = 0; i < cold_count + hot_count; i++) {
|
||||
if (is_extension_exist(name, extlist[i]))
|
||||
if (is_extension_exist(name, extlist[i], true))
|
||||
break;
|
||||
}
|
||||
|
||||
up_read(&sbi->sb_lock);
|
||||
f2fs_up_read(&sbi->sb_lock);
|
||||
|
||||
if (i == cold_count + hot_count)
|
||||
return;
|
||||
@ -274,34 +282,44 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
|
||||
const unsigned char *name)
|
||||
{
|
||||
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
|
||||
unsigned char (*ext)[F2FS_EXTENSION_LEN];
|
||||
unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
|
||||
unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions;
|
||||
unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions;
|
||||
unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
|
||||
unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
|
||||
int i, cold_count, hot_count;
|
||||
|
||||
if (!f2fs_sb_has_compression(sbi) ||
|
||||
is_inode_flag_set(inode, FI_COMPRESSED_FILE) ||
|
||||
F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
|
||||
!f2fs_may_compress(inode))
|
||||
!f2fs_may_compress(inode) ||
|
||||
(!ext_cnt && !noext_cnt))
|
||||
return;
|
||||
|
||||
down_read(&sbi->sb_lock);
|
||||
f2fs_down_read(&sbi->sb_lock);
|
||||
|
||||
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
|
||||
hot_count = sbi->raw_super->hot_ext_count;
|
||||
|
||||
for (i = cold_count; i < cold_count + hot_count; i++) {
|
||||
if (is_extension_exist(name, extlist[i])) {
|
||||
up_read(&sbi->sb_lock);
|
||||
if (is_extension_exist(name, extlist[i], false)) {
|
||||
f2fs_up_read(&sbi->sb_lock);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
up_read(&sbi->sb_lock);
|
||||
f2fs_up_read(&sbi->sb_lock);
|
||||
|
||||
ext = F2FS_OPTION(sbi).extensions;
|
||||
for (i = 0; i < noext_cnt; i++) {
|
||||
if (is_extension_exist(name, noext[i], false)) {
|
||||
f2fs_disable_compressed_file(inode);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
|
||||
return;
|
||||
|
||||
for (i = 0; i < ext_cnt; i++) {
|
||||
if (!is_extension_exist(name, ext[i]))
|
||||
if (!is_extension_exist(name, ext[i], false))
|
||||
continue;
|
||||
|
||||
set_compress_context(inode);
|
||||
@ -322,7 +340,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
if (!f2fs_is_checkpoint_ready(sbi))
|
||||
return -ENOSPC;
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -381,7 +399,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
|
||||
F2FS_I(old_dentry->d_inode)->i_projid)))
|
||||
return -EXDEV;
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -414,6 +432,7 @@ struct dentry *f2fs_get_parent(struct dentry *child)
|
||||
struct qstr dotdot = QSTR_INIT("..", 2);
|
||||
struct page *page;
|
||||
unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
|
||||
|
||||
if (!ino) {
|
||||
if (IS_ERR(page))
|
||||
return ERR_CAST(page);
|
||||
@ -437,7 +456,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -492,7 +511,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
|
||||
}
|
||||
|
||||
err = f2fs_prepare_lookup(dir, dentry, &fname);
|
||||
generic_set_encrypted_ci_d_ops(dir, dentry);
|
||||
generic_set_encrypted_ci_d_ops(dentry);
|
||||
if (err == -ENOENT)
|
||||
goto out_splice;
|
||||
if (err)
|
||||
@ -570,15 +589,17 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
|
||||
trace_f2fs_unlink_enter(dir, dentry);
|
||||
|
||||
if (unlikely(f2fs_cp_error(sbi)))
|
||||
return -EIO;
|
||||
if (unlikely(f2fs_cp_error(sbi))) {
|
||||
err = -EIO;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
err = dquot_initialize(inode);
|
||||
goto fail;
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err)
|
||||
return err;
|
||||
goto fail;
|
||||
|
||||
de = f2fs_find_entry(dir, &dentry->d_name, &page);
|
||||
if (!de) {
|
||||
@ -601,7 +622,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
/* VFS negative dentries are incompatible with Encoding and
|
||||
* Case-insensitiveness. Eventually we'll want avoid
|
||||
* invalidating the dentries here, alongside with returning the
|
||||
* negative dentries at f2fs_lookup(), when it is better
|
||||
* negative dentries at f2fs_lookup(), when it is better
|
||||
* supported by the VFS for the CI case.
|
||||
*/
|
||||
if (IS_CASEFOLDED(dir))
|
||||
@ -621,6 +642,7 @@ static const char *f2fs_get_link(struct dentry *dentry,
|
||||
struct delayed_call *done)
|
||||
{
|
||||
const char *link = page_get_link(dentry, inode, done);
|
||||
|
||||
if (!IS_ERR(link) && !*link) {
|
||||
/* this is broken symlink case */
|
||||
do_delayed_call(done);
|
||||
@ -649,7 +671,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -706,7 +728,7 @@ out_f2fs_handle_failed_inode:
|
||||
f2fs_handle_failed_inode(inode);
|
||||
out_free_encrypted_link:
|
||||
if (disk_link.name != (unsigned char *)symname)
|
||||
kvfree(disk_link.name);
|
||||
kfree(disk_link.name);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -719,7 +741,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
||||
if (unlikely(f2fs_cp_error(sbi)))
|
||||
return -EIO;
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -730,7 +752,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
||||
inode->i_op = &f2fs_dir_inode_operations;
|
||||
inode->i_fop = &f2fs_dir_operations;
|
||||
inode->i_mapping->a_ops = &f2fs_dblock_aops;
|
||||
inode_nohighmem(inode);
|
||||
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
|
||||
|
||||
set_inode_flag(inode, FI_INC_LINK);
|
||||
f2fs_lock_op(sbi);
|
||||
@ -758,6 +780,7 @@ out_fail:
|
||||
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
struct inode *inode = d_inode(dentry);
|
||||
|
||||
if (f2fs_empty_dir(inode))
|
||||
return f2fs_unlink(dir, dentry);
|
||||
return -ENOTEMPTY;
|
||||
@ -775,7 +798,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
|
||||
if (!f2fs_is_checkpoint_ready(sbi))
|
||||
return -ENOSPC;
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -813,7 +836,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
|
||||
struct inode *inode;
|
||||
int err;
|
||||
|
||||
err = dquot_initialize(dir);
|
||||
err = f2fs_dquot_initialize(dir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -848,7 +871,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
|
||||
|
||||
if (whiteout) {
|
||||
f2fs_i_links_write(inode, false);
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
inode->i_state |= I_LINKABLE;
|
||||
spin_unlock(&inode->i_lock);
|
||||
|
||||
*whiteout = inode;
|
||||
} else {
|
||||
d_tmpfile(dentry, inode);
|
||||
@ -932,16 +959,16 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
return err;
|
||||
}
|
||||
|
||||
err = dquot_initialize(old_dir);
|
||||
err = f2fs_dquot_initialize(old_dir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = dquot_initialize(new_dir);
|
||||
err = f2fs_dquot_initialize(new_dir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (new_inode) {
|
||||
err = dquot_initialize(new_inode);
|
||||
err = f2fs_dquot_initialize(new_inode);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
@ -990,11 +1017,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
new_page = NULL;
|
||||
|
||||
new_inode->i_ctime = current_time(new_inode);
|
||||
down_write(&F2FS_I(new_inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
|
||||
if (old_dir_entry)
|
||||
f2fs_i_links_write(new_inode, false);
|
||||
f2fs_i_links_write(new_inode, false);
|
||||
up_write(&F2FS_I(new_inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
|
||||
|
||||
if (!new_inode->i_nlink)
|
||||
f2fs_add_orphan_inode(new_inode);
|
||||
@ -1015,13 +1042,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
f2fs_i_links_write(new_dir, true);
|
||||
}
|
||||
|
||||
down_write(&F2FS_I(old_inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(old_inode)->i_sem);
|
||||
if (!old_dir_entry || whiteout)
|
||||
file_lost_pino(old_inode);
|
||||
else
|
||||
/* adjust dir's i_pino to pass fsck check */
|
||||
f2fs_i_pino_write(old_inode, new_dir->i_ino);
|
||||
up_write(&F2FS_I(old_inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
|
||||
|
||||
old_inode->i_ctime = current_time(old_inode);
|
||||
f2fs_mark_inode_dirty_sync(old_inode, false);
|
||||
@ -1034,7 +1061,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
err = f2fs_add_link(old_dentry, whiteout);
|
||||
if (err)
|
||||
goto put_out_dir;
|
||||
|
||||
spin_lock(&whiteout->i_lock);
|
||||
whiteout->i_state &= ~I_LINKABLE;
|
||||
spin_unlock(&whiteout->i_lock);
|
||||
|
||||
iput(whiteout);
|
||||
}
|
||||
|
||||
@ -1070,8 +1101,7 @@ out_dir:
|
||||
out_old:
|
||||
f2fs_put_page(old_page, 0);
|
||||
out:
|
||||
if (whiteout)
|
||||
iput(whiteout);
|
||||
iput(whiteout);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -1101,11 +1131,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
F2FS_I(new_dentry->d_inode)->i_projid)))
|
||||
return -EXDEV;
|
||||
|
||||
err = dquot_initialize(old_dir);
|
||||
err = f2fs_dquot_initialize(old_dir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = dquot_initialize(new_dir);
|
||||
err = f2fs_dquot_initialize(new_dir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
@ -1177,38 +1207,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
/* update directory entry info of old dir inode */
|
||||
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
|
||||
|
||||
down_write(&F2FS_I(old_inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(old_inode)->i_sem);
|
||||
if (!old_dir_entry)
|
||||
file_lost_pino(old_inode);
|
||||
else
|
||||
/* adjust dir's i_pino to pass fsck check */
|
||||
f2fs_i_pino_write(old_inode, new_dir->i_ino);
|
||||
up_write(&F2FS_I(old_inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
|
||||
|
||||
old_dir->i_ctime = current_time(old_dir);
|
||||
if (old_nlink) {
|
||||
down_write(&F2FS_I(old_dir)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(old_dir)->i_sem);
|
||||
f2fs_i_links_write(old_dir, old_nlink > 0);
|
||||
up_write(&F2FS_I(old_dir)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(old_dir)->i_sem);
|
||||
}
|
||||
f2fs_mark_inode_dirty_sync(old_dir, false);
|
||||
|
||||
/* update directory entry info of new dir inode */
|
||||
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
|
||||
|
||||
down_write(&F2FS_I(new_inode)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
|
||||
if (!new_dir_entry)
|
||||
file_lost_pino(new_inode);
|
||||
else
|
||||
/* adjust dir's i_pino to pass fsck check */
|
||||
f2fs_i_pino_write(new_inode, old_dir->i_ino);
|
||||
up_write(&F2FS_I(new_inode)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
|
||||
|
||||
new_dir->i_ctime = current_time(new_dir);
|
||||
if (new_nlink) {
|
||||
down_write(&F2FS_I(new_dir)->i_sem);
|
||||
f2fs_down_write(&F2FS_I(new_dir)->i_sem);
|
||||
f2fs_i_links_write(new_dir, new_nlink > 0);
|
||||
up_write(&F2FS_I(new_dir)->i_sem);
|
||||
f2fs_up_write(&F2FS_I(new_dir)->i_sem);
|
||||
}
|
||||
f2fs_mark_inode_dirty_sync(new_dir, false);
|
||||
|
||||
@ -1286,7 +1316,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
|
||||
}
|
||||
|
||||
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
|
||||
.get_link = f2fs_encrypted_get_link,
|
||||
.get_link = f2fs_encrypted_get_link,
|
||||
.getattr = f2fs_getattr,
|
||||
.setattr = f2fs_setattr,
|
||||
.listxattr = f2fs_listxattr,
|
||||
@ -1312,7 +1342,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
|
||||
};
|
||||
|
||||
const struct inode_operations f2fs_symlink_inode_operations = {
|
||||
.get_link = f2fs_get_link,
|
||||
.get_link = f2fs_get_link,
|
||||
.getattr = f2fs_getattr,
|
||||
.setattr = f2fs_setattr,
|
||||
.listxattr = f2fs_listxattr,
|
||||
@ -1320,7 +1350,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
|
||||
|
||||
const struct inode_operations f2fs_special_inode_operations = {
|
||||
.getattr = f2fs_getattr,
|
||||
.setattr = f2fs_setattr,
|
||||
.setattr = f2fs_setattr,
|
||||
.get_acl = f2fs_get_acl,
|
||||
.set_acl = f2fs_set_acl,
|
||||
.listxattr = f2fs_listxattr,
|
||||
|
401	fs/f2fs/node.c
@ -17,7 +17,7 @@
|
||||
#include "node.h"
|
||||
#include "segment.h"
|
||||
#include "xattr.h"
|
||||
#include "trace.h"
|
||||
#include "iostat.h"
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
|
||||
@ -44,11 +44,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
|
||||
bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
|
||||
{
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
|
||||
struct sysinfo val;
|
||||
unsigned long avail_ram;
|
||||
unsigned long mem_size = 0;
|
||||
bool res = false;
|
||||
|
||||
if (!nm_i)
|
||||
return true;
|
||||
|
||||
si_meminfo(&val);
|
||||
|
||||
/* only uses low memory */
|
||||
@ -62,8 +66,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
|
||||
sizeof(struct free_nid)) >> PAGE_SHIFT;
|
||||
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
|
||||
} else if (type == NAT_ENTRIES) {
|
||||
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
|
||||
PAGE_SHIFT;
|
||||
mem_size = (nm_i->nat_cnt[TOTAL_NAT] *
|
||||
sizeof(struct nat_entry)) >> PAGE_SHIFT;
|
||||
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
|
||||
if (excess_cached_nats(sbi))
|
||||
res = false;
|
||||
@ -90,6 +94,24 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
|
||||
/* it allows 20% / total_ram for inmemory pages */
|
||||
mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
|
||||
res = mem_size < (val.totalram / 5);
|
||||
} else if (type == DISCARD_CACHE) {
|
||||
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
|
||||
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
|
||||
res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
|
||||
} else if (type == COMPRESS_PAGE) {
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
unsigned long free_ram = val.freeram;
|
||||
|
||||
/*
|
||||
* free memory is lower than watermark or cached page count
|
||||
* exceed threshold, deny caching compress page.
|
||||
*/
|
||||
res = (free_ram > avail_ram * sbi->compress_watermark / 100) &&
|
||||
(COMPRESS_MAPPING(sbi)->nrpages <
|
||||
free_ram * sbi->compress_percent / 100);
|
||||
#else
|
||||
res = false;
|
||||
#endif
|
||||
} else {
|
||||
if (!sbi->sb->s_bdi->wb.dirty_exceeded)
|
||||
return true;
|
||||
@ -109,7 +131,7 @@ static void clear_node_page_dirty(struct page *page)
|
||||
|
||||
static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
|
||||
{
|
||||
return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid));
|
||||
return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid));
|
||||
}
|
||||
|
||||
static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
|
||||
@ -141,14 +163,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
|
||||
return dst_page;
|
||||
}
|
||||
|
||||
static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
|
||||
static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
|
||||
nid_t nid, bool no_fail)
|
||||
{
|
||||
struct nat_entry *new;
|
||||
|
||||
if (no_fail)
|
||||
new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
|
||||
else
|
||||
new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
|
||||
new = f2fs_kmem_cache_alloc(nat_entry_slab,
|
||||
GFP_F2FS_ZERO, no_fail, sbi);
|
||||
if (new) {
|
||||
nat_set_nid(new, nid);
|
||||
nat_reset_flag(new);
|
||||
@ -177,7 +198,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
|
||||
list_add_tail(&ne->list, &nm_i->nat_entries);
|
||||
spin_unlock(&nm_i->nat_list_lock);
|
||||
|
||||
nm_i->nat_cnt++;
|
||||
nm_i->nat_cnt[TOTAL_NAT]++;
|
||||
nm_i->nat_cnt[RECLAIMABLE_NAT]++;
|
||||
return ne;
|
||||
}
|
||||
|
||||
@ -207,7 +229,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
|
||||
static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
|
||||
{
|
||||
radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
|
||||
nm_i->nat_cnt--;
|
||||
nm_i->nat_cnt[TOTAL_NAT]--;
|
||||
nm_i->nat_cnt[RECLAIMABLE_NAT]--;
|
||||
__free_nat_entry(e);
|
||||
}
|
||||
|
||||
@ -219,7 +242,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
|
||||
|
||||
head = radix_tree_lookup(&nm_i->nat_set_root, set);
|
||||
if (!head) {
|
||||
head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
|
||||
head = f2fs_kmem_cache_alloc(nat_entry_set_slab,
|
||||
GFP_NOFS, true, NULL);
|
||||
|
||||
INIT_LIST_HEAD(&head->entry_list);
|
||||
INIT_LIST_HEAD(&head->set_list);
|
||||
@ -253,7 +277,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
|
||||
if (get_nat_flag(ne, IS_DIRTY))
|
||||
goto refresh_list;
|
||||
|
||||
nm_i->dirty_nat_cnt++;
|
||||
nm_i->nat_cnt[DIRTY_NAT]++;
|
||||
nm_i->nat_cnt[RECLAIMABLE_NAT]--;
|
||||
set_nat_flag(ne, IS_DIRTY, true);
|
||||
refresh_list:
|
||||
spin_lock(&nm_i->nat_list_lock);
|
||||
@ -273,7 +298,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
|
||||
|
||||
set_nat_flag(ne, IS_DIRTY, false);
|
||||
set->entry_cnt--;
|
||||
nm_i->dirty_nat_cnt--;
|
||||
nm_i->nat_cnt[DIRTY_NAT]--;
|
||||
nm_i->nat_cnt[RECLAIMABLE_NAT]++;
|
||||
}
|
||||
|
||||
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
|
||||
@ -304,7 +330,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
|
||||
unsigned long flags;
|
||||
unsigned int seq_id;
|
||||
|
||||
fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
|
||||
fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab,
|
||||
GFP_NOFS, true, NULL);
|
||||
|
||||
get_page(page);
|
||||
fn->page = page;
|
||||
@ -355,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
|
||||
struct nat_entry *e;
|
||||
bool need = false;
|
||||
|
||||
down_read(&nm_i->nat_tree_lock);
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
e = __lookup_nat_cache(nm_i, nid);
|
||||
if (e) {
|
||||
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
|
||||
!get_nat_flag(e, HAS_FSYNCED_INODE))
|
||||
need = true;
|
||||
}
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
return need;
|
||||
}
|
||||
|
||||
@ -372,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
|
||||
struct nat_entry *e;
|
||||
bool is_cp = true;
|
||||
|
||||
down_read(&nm_i->nat_tree_lock);
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
e = __lookup_nat_cache(nm_i, nid);
|
||||
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
|
||||
is_cp = false;
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
return is_cp;
|
||||
}
|
||||
|
||||
@ -386,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
|
||||
struct nat_entry *e;
|
||||
bool need_update = true;
|
||||
|
||||
down_read(&nm_i->nat_tree_lock);
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
e = __lookup_nat_cache(nm_i, ino);
|
||||
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
|
||||
(get_nat_flag(e, IS_CHECKPOINTED) ||
|
||||
get_nat_flag(e, HAS_FSYNCED_INODE)))
|
||||
need_update = false;
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
return need_update;
|
||||
}
|
||||
|
||||
@ -403,11 +430,15 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
struct nat_entry *new, *e;
|
||||
|
||||
new = __alloc_nat_entry(nid, false);
|
||||
/* Let's mitigate lock contention of nat_tree_lock during checkpoint */
|
||||
if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
|
||||
return;
|
||||
|
||||
new = __alloc_nat_entry(sbi, nid, false);
|
||||
if (!new)
|
||||
return;
|
||||
|
||||
down_write(&nm_i->nat_tree_lock);
|
||||
f2fs_down_write(&nm_i->nat_tree_lock);
|
||||
e = __lookup_nat_cache(nm_i, nid);
|
||||
if (!e)
|
||||
e = __init_nat_entry(nm_i, new, ne, false);
|
||||
@ -416,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
|
||||
nat_get_blkaddr(e) !=
|
||||
le32_to_cpu(ne->block_addr) ||
|
||||
nat_get_version(e) != ne->version);
|
||||
up_write(&nm_i->nat_tree_lock);
|
||||
f2fs_up_write(&nm_i->nat_tree_lock);
|
||||
if (e != new)
|
||||
__free_nat_entry(new);
|
||||
}
|
||||
@ -426,9 +457,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
|
||||
{
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
struct nat_entry *e;
|
||||
struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
|
||||
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
|
||||
|
||||
down_write(&nm_i->nat_tree_lock);
|
||||
f2fs_down_write(&nm_i->nat_tree_lock);
|
||||
e = __lookup_nat_cache(nm_i, ni->nid);
|
||||
if (!e) {
|
||||
e = __init_nat_entry(nm_i, new, NULL, true);
|
||||
@ -459,6 +490,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
|
||||
/* increment version no as node is removed */
|
||||
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
|
||||
unsigned char version = nat_get_version(e);
|
||||
|
||||
nat_set_version(e, inc_node_version(version));
|
||||
}
|
||||
|
||||
@ -476,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
|
||||
set_nat_flag(e, HAS_FSYNCED_INODE, true);
|
||||
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
|
||||
}
|
||||
up_write(&nm_i->nat_tree_lock);
|
||||
f2fs_up_write(&nm_i->nat_tree_lock);
|
||||
}
|
||||
|
||||
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
|
||||
@ -484,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
int nr = nr_shrink;
|
||||
|
||||
if (!down_write_trylock(&nm_i->nat_tree_lock))
|
||||
if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
|
||||
return 0;
|
||||
|
||||
spin_lock(&nm_i->nat_list_lock);
|
||||
@ -506,12 +538,12 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
|
||||
}
|
||||
spin_unlock(&nm_i->nat_list_lock);
|
||||
|
||||
up_write(&nm_i->nat_tree_lock);
|
||||
f2fs_up_write(&nm_i->nat_tree_lock);
|
||||
return nr - nr_shrink;
|
||||
}
|
||||
|
||||
int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
|
||||
struct node_info *ni)
|
||||
struct node_info *ni, bool checkpoint_context)
|
||||
{
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
|
||||
@ -526,36 +558,46 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
|
||||
int i;
|
||||
|
||||
ni->nid = nid;
|
||||
|
||||
retry:
|
||||
/* Check nat cache */
|
||||
down_read(&nm_i->nat_tree_lock);
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
e = __lookup_nat_cache(nm_i, nid);
|
||||
if (e) {
|
||||
ni->ino = nat_get_ino(e);
|
||||
ni->blk_addr = nat_get_blkaddr(e);
|
||||
ni->version = nat_get_version(e);
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
|
||||
/*
|
||||
* Check current segment summary by trying to grab journal_rwsem first.
|
||||
* This sem is on the critical path on the checkpoint requiring the above
|
||||
* nat_tree_lock. Therefore, we should retry, if we failed to grab here
|
||||
* while not bothering checkpoint.
|
||||
*/
|
||||
if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
|
||||
down_read(&curseg->journal_rwsem);
|
||||
} else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
|
||||
!down_read_trylock(&curseg->journal_rwsem)) {
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
/* Check current segment summary */
|
||||
down_read(&curseg->journal_rwsem);
|
||||
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
|
||||
if (i >= 0) {
|
||||
ne = nat_in_journal(journal, i);
|
||||
node_info_from_raw_nat(ni, &ne);
|
||||
}
|
||||
up_read(&curseg->journal_rwsem);
|
||||
up_read(&curseg->journal_rwsem);
|
||||
if (i >= 0) {
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
goto cache;
|
||||
}
|
||||
|
||||
/* Fill node_info from nat page */
|
||||
index = current_nat_addr(sbi, nid);
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
|
||||
page = f2fs_get_meta_page(sbi, index);
|
||||
if (IS_ERR(page))
|
||||
@ -804,6 +846,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
|
||||
dn->ofs_in_node = offset[level];
|
||||
dn->node_page = npage[level];
|
||||
dn->data_blkaddr = f2fs_data_blkaddr(dn);
|
||||
|
||||
if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
|
||||
f2fs_sb_has_readonly(sbi)) {
|
||||
unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
|
||||
block_t blkaddr;
|
||||
|
||||
if (!c_len)
|
||||
goto out;
|
||||
|
||||
blkaddr = f2fs_data_blkaddr(dn);
|
||||
if (blkaddr == COMPRESS_ADDR)
|
||||
blkaddr = data_blkaddr(dn->inode, dn->node_page,
|
||||
dn->ofs_in_node + 1);
|
||||
|
||||
f2fs_update_extent_tree_range_compressed(dn->inode,
|
||||
index, blkaddr,
|
||||
F2FS_I(dn->inode)->i_cluster_size,
|
||||
c_len);
|
||||
}
|
||||
out:
|
||||
return 0;
|
||||
|
||||
release_pages:
|
||||
@ -828,7 +890,7 @@ static int truncate_node(struct dnode_of_data *dn)
|
||||
int err;
|
||||
pgoff_t index;
|
||||
|
||||
err = f2fs_get_node_info(sbi, dn->nid, &ni);
|
||||
err = f2fs_get_node_info(sbi, dn->nid, &ni, false);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -1039,8 +1101,10 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
|
||||
trace_f2fs_truncate_inode_blocks_enter(inode, from);
|
||||
|
||||
level = get_node_path(inode, from, offset, noffset);
|
||||
if (level < 0)
|
||||
if (level < 0) {
|
||||
trace_f2fs_truncate_inode_blocks_exit(inode, level);
|
||||
return level;
|
||||
}
|
||||
|
||||
page = f2fs_get_node_page(sbi, inode->i_ino);
|
||||
if (IS_ERR(page)) {
|
||||
@ -1225,7 +1289,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
|
||||
goto fail;
|
||||
|
||||
#ifdef CONFIG_F2FS_CHECK_FS
|
||||
err = f2fs_get_node_info(sbi, dn->nid, &new_ni);
|
||||
err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false);
|
||||
if (err) {
|
||||
dec_valid_node_count(sbi, dn->inode, !ofs);
|
||||
goto fail;
|
||||
@ -1287,11 +1351,12 @@ static int read_node_page(struct page *page, int op_flags)
|
||||
return LOCKED_PAGE;
|
||||
}
|
||||
|
||||
err = f2fs_get_node_info(sbi, page->index, &ni);
|
||||
err = f2fs_get_node_info(sbi, page->index, &ni, false);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (unlikely(ni.blk_addr == NULL_ADDR) ||
|
||||
/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
|
||||
if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
|
||||
is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
|
||||
ClearPageUptodate(page);
|
||||
return -ENOENT;
|
||||
@ -1378,11 +1443,12 @@ repeat:
|
||||
goto out_err;
|
||||
}
|
||||
page_hit:
|
||||
if(unlikely(nid != nid_of_node(page))) {
|
||||
if (unlikely(nid != nid_of_node(page))) {
|
||||
f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
|
||||
nid, nid_of_node(page), ino_of_node(page),
|
||||
ofs_of_node(page), cpver_of_node(page),
|
||||
next_blkaddr_of_node(page));
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
err = -EINVAL;
|
||||
out_err:
|
||||
ClearPageUptodate(page);
|
||||
@ -1521,13 +1587,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
|
||||
trace_f2fs_writepage(page, NODE);
|
||||
|
||||
if (unlikely(f2fs_cp_error(sbi))) {
|
||||
if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
|
||||
ClearPageUptodate(page);
|
||||
dec_page_count(sbi, F2FS_DIRTY_NODES);
|
||||
unlock_page(page);
|
||||
return 0;
|
||||
}
|
||||
goto redirty_out;
|
||||
ClearPageUptodate(page);
|
||||
dec_page_count(sbi, F2FS_DIRTY_NODES);
|
||||
unlock_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
|
||||
@ -1542,21 +1605,21 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
|
||||
nid = nid_of_node(page);
|
||||
f2fs_bug_on(sbi, page->index != nid);
|
||||
|
||||
if (f2fs_get_node_info(sbi, nid, &ni))
|
||||
if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
|
||||
goto redirty_out;
|
||||
|
||||
if (wbc->for_reclaim) {
|
||||
if (!down_read_trylock(&sbi->node_write))
|
||||
if (!f2fs_down_read_trylock(&sbi->node_write))
|
||||
goto redirty_out;
|
||||
} else {
|
||||
down_read(&sbi->node_write);
|
||||
f2fs_down_read(&sbi->node_write);
|
||||
}
|
||||
|
||||
/* This page is already truncated */
|
||||
if (unlikely(ni.blk_addr == NULL_ADDR)) {
|
||||
ClearPageUptodate(page);
|
||||
dec_page_count(sbi, F2FS_DIRTY_NODES);
|
||||
up_read(&sbi->node_write);
|
||||
f2fs_up_read(&sbi->node_write);
|
||||
unlock_page(page);
|
||||
return 0;
|
||||
}
|
||||
@ -1564,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
|
||||
if (__is_valid_data_blkaddr(ni.blk_addr) &&
|
||||
!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
|
||||
DATA_GENERIC_ENHANCE)) {
|
||||
up_read(&sbi->node_write);
|
||||
f2fs_up_read(&sbi->node_write);
|
||||
goto redirty_out;
|
||||
}
|
||||
|
||||
@ -1585,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
|
||||
f2fs_do_write_node_page(nid, &fio);
|
||||
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
|
||||
dec_page_count(sbi, F2FS_DIRTY_NODES);
|
||||
up_read(&sbi->node_write);
|
||||
f2fs_up_read(&sbi->node_write);
|
||||
|
||||
if (wbc->for_reclaim) {
|
||||
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
|
||||
@ -1719,6 +1782,7 @@ continue_unlock:
|
||||
|
||||
if (!atomic || page == last_page) {
|
||||
set_fsync_mark(page, 1);
|
||||
percpu_counter_inc(&sbi->rf_node_block_count);
|
||||
if (IS_INODE(page)) {
|
||||
if (is_inode_flag_set(inode,
|
||||
FI_DIRTY_INODE))
|
||||
@ -1726,7 +1790,7 @@ continue_unlock:
|
||||
set_dentry_mark(page,
|
||||
f2fs_need_dentry_mark(sbi, ino));
|
||||
}
|
||||
/* may be written by other thread */
|
||||
/* may be written by other thread */
|
||||
if (!PageDirty(page))
|
||||
set_page_dirty(page);
|
||||
}
|
||||
@ -1770,7 +1834,7 @@ continue_unlock:
|
||||
out:
|
||||
if (nwritten)
|
||||
f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
|
||||
return ret ? -EIO: 0;
|
||||
return ret ? -EIO : 0;
|
||||
}
|
||||
|
||||
static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
|
||||
@ -1814,12 +1878,11 @@ static bool flush_dirty_inode(struct page *page)
|
||||
return true;
|
||||
}
|
||||
|
||||
int f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
|
||||
void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
pgoff_t index = 0;
|
||||
struct pagevec pvec;
|
||||
int nr_pages;
|
||||
int ret = 0;
|
||||
|
||||
pagevec_init(&pvec);
|
||||
|
||||
@ -1847,8 +1910,8 @@ continue_unlock:
|
||||
}
|
||||
|
||||
/* flush inline_data, if it's async context. */
|
||||
if (is_inline_node(page)) {
|
||||
clear_inline_node(page);
|
||||
if (page_private_inline(page)) {
|
||||
clear_page_private_inline(page);
|
||||
unlock_page(page);
|
||||
flush_inline_data(sbi, ino_of_node(page));
|
||||
continue;
|
||||
@ -1858,7 +1921,6 @@ continue_unlock:
|
||||
pagevec_release(&pvec);
|
||||
cond_resched();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
|
||||
@ -1924,9 +1986,13 @@ continue_unlock:
|
||||
goto continue_unlock;
|
||||
}
|
||||
|
||||
/* flush inline_data, if it's async context. */
|
||||
if (do_balance && is_inline_node(page)) {
|
||||
clear_inline_node(page);
|
||||
/* flush inline_data/inode, if it's async context. */
|
||||
if (!do_balance)
|
||||
goto write_node;
|
||||
|
||||
/* flush inline_data */
|
||||
if (page_private_inline(page)) {
|
||||
clear_page_private_inline(page);
|
||||
unlock_page(page);
|
||||
flush_inline_data(sbi, ino_of_node(page));
|
||||
goto lock_node;
|
||||
@ -1938,7 +2004,7 @@ continue_unlock:
|
||||
if (flush_dirty_inode(page))
|
||||
goto lock_node;
|
||||
}
|
||||
|
||||
write_node:
|
||||
f2fs_wait_on_page_writeback(page, NODE, true, true);
|
||||
|
||||
if (!clear_page_dirty_for_io(page))
|
||||
@ -2046,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping,
|
||||
|
||||
if (wbc->sync_mode == WB_SYNC_ALL)
|
||||
atomic_inc(&sbi->wb_sync_req[NODE]);
|
||||
else if (atomic_read(&sbi->wb_sync_req[NODE]))
|
||||
else if (atomic_read(&sbi->wb_sync_req[NODE])) {
|
||||
/* to avoid potential deadlock */
|
||||
if (current->plug)
|
||||
blk_finish_plug(current->plug);
|
||||
goto skip_write;
|
||||
}
|
||||
|
||||
trace_f2fs_writepages(mapping->host, wbc, NODE);
|
||||
|
||||
@ -2080,8 +2150,7 @@ static int f2fs_set_node_page_dirty(struct page *page)
|
||||
if (!PageDirty(page)) {
|
||||
__set_page_dirty_nobuffers(page);
|
||||
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
|
||||
f2fs_set_page_private(page, 0);
|
||||
f2fs_trace_pid(page);
|
||||
set_page_private_reference(page);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
@ -2097,7 +2166,7 @@ const struct address_space_operations f2fs_node_aops = {
|
||||
.invalidatepage = f2fs_invalidate_page,
|
||||
.releasepage = f2fs_release_page,
|
||||
#ifdef CONFIG_MIGRATION
|
||||
.migratepage = f2fs_migrate_page,
|
||||
.migratepage = f2fs_migrate_page,
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -2108,18 +2177,16 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
|
||||
}
|
||||
|
||||
static int __insert_free_nid(struct f2fs_sb_info *sbi,
|
||||
struct free_nid *i, enum nid_state state)
|
||||
struct free_nid *i)
|
||||
{
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
|
||||
int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
f2fs_bug_on(sbi, state != i->state);
|
||||
nm_i->nid_cnt[state]++;
|
||||
if (state == FREE_NID)
|
||||
list_add_tail(&i->list, &nm_i->free_nid_list);
|
||||
nm_i->nid_cnt[FREE_NID]++;
|
||||
list_add_tail(&i->list, &nm_i->free_nid_list);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2157,6 +2224,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
|
||||
}
|
||||
}
|
||||
|
||||
bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
unsigned int i;
|
||||
bool ret = true;
|
||||
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
for (i = 0; i < nm_i->nat_blocks; i++) {
|
||||
if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
|
||||
ret = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
|
||||
bool set, bool build)
|
||||
{
|
||||
@ -2198,7 +2283,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
|
||||
if (unlikely(f2fs_check_nid_range(sbi, nid)))
|
||||
return false;
|
||||
|
||||
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
|
||||
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL);
|
||||
i->nid = nid;
|
||||
i->state = FREE_NID;
|
||||
|
||||
@ -2241,7 +2326,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
|
||||
}
|
||||
}
|
||||
ret = true;
|
||||
err = __insert_free_nid(sbi, i, FREE_NID);
|
||||
err = __insert_free_nid(sbi, i);
|
||||
err_out:
|
||||
if (update) {
|
||||
update_free_nid_bitmap(sbi, nid, ret, build);
|
||||
@ -2335,7 +2420,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
|
||||
unsigned int i, idx;
|
||||
nid_t nid;
|
||||
|
||||
down_read(&nm_i->nat_tree_lock);
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
|
||||
for (i = 0; i < nm_i->nat_blocks; i++) {
|
||||
if (!test_bit_le(i, nm_i->nat_block_bitmap))
|
||||
@ -2358,7 +2443,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
|
||||
out:
|
||||
scan_curseg_cache(sbi);
|
||||
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
}
|
||||
|
||||
static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
|
||||
@ -2393,7 +2478,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
|
||||
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
|
||||
META_NAT, true);
|
||||
|
||||
down_read(&nm_i->nat_tree_lock);
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
|
||||
while (1) {
|
||||
if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
|
||||
@ -2408,7 +2493,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
|
||||
return ret;
|
||||
}
|
||||
@ -2428,7 +2513,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
|
||||
/* find free nids from current sum_pages */
|
||||
scan_curseg_cache(sbi);
|
||||
|
||||
up_read(&nm_i->nat_tree_lock);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
|
||||
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
|
||||
nm_i->ra_nid_pages, META_NAT, false);
|
||||
@ -2575,7 +2660,7 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
|
||||
return nr - nr_shrink;
|
||||
}
|
||||
|
||||
void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
|
||||
int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
|
||||
{
|
||||
void *src_addr, *dst_addr;
|
||||
size_t inline_size;
|
||||
@ -2583,13 +2668,20 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
|
||||
struct f2fs_inode *ri;
|
||||
|
||||
ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
|
||||
f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
|
||||
if (IS_ERR(ipage))
|
||||
return PTR_ERR(ipage);
|
||||
|
||||
ri = F2FS_INODE(page);
|
||||
if (ri->i_inline & F2FS_INLINE_XATTR) {
|
||||
set_inode_flag(inode, FI_INLINE_XATTR);
|
||||
if (!f2fs_has_inline_xattr(inode)) {
|
||||
set_inode_flag(inode, FI_INLINE_XATTR);
|
||||
stat_inc_inline_xattr(inode);
|
||||
}
|
||||
} else {
|
||||
clear_inode_flag(inode, FI_INLINE_XATTR);
|
||||
if (f2fs_has_inline_xattr(inode)) {
|
||||
stat_dec_inline_xattr(inode);
|
||||
clear_inode_flag(inode, FI_INLINE_XATTR);
|
||||
}
|
||||
goto update_inode;
|
||||
}
|
||||
|
||||
@ -2602,6 +2694,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
|
||||
update_inode:
|
||||
f2fs_update_inode(inode, ipage);
|
||||
f2fs_put_page(ipage, 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
|
||||
@ -2618,7 +2711,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
|
||||
goto recover_xnid;
|
||||
|
||||
/* 1: invalidate the previous xattr nid */
|
||||
err = f2fs_get_node_info(sbi, prev_xnid, &ni);
|
||||
err = f2fs_get_node_info(sbi, prev_xnid, &ni, false);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -2658,7 +2751,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
|
||||
struct page *ipage;
|
||||
int err;
|
||||
|
||||
err = f2fs_get_node_info(sbi, ino, &old_ni);
|
||||
err = f2fs_get_node_info(sbi, ino, &old_ni, false);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -2682,7 +2775,7 @@ retry:
|
||||
src = F2FS_INODE(page);
|
||||
dst = F2FS_INODE(ipage);
|
||||
|
||||
memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
|
||||
memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
|
||||
dst->i_size = 0;
|
||||
dst->i_blocks = cpu_to_le64(1);
|
||||
dst->i_links = cpu_to_le32(1);
|
||||
@ -2773,11 +2866,14 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
|
||||
struct f2fs_nat_entry raw_ne;
|
||||
nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
|
||||
|
||||
if (f2fs_check_nid_range(sbi, nid))
|
||||
continue;
|
||||
|
||||
raw_ne = nat_in_journal(journal, i);
|
||||
|
||||
ne = __lookup_nat_cache(nm_i, nid);
|
||||
if (!ne) {
|
||||
ne = __alloc_nat_entry(nid, true);
|
||||
ne = __alloc_nat_entry(sbi, nid, true);
|
||||
__init_nat_entry(nm_i, ne, &raw_ne, true);
|
||||
}
|
||||
|
||||
@ -2817,7 +2913,23 @@ add_out:
|
||||
list_add_tail(&nes->set_list, head);
|
||||
}
|
||||
|
||||
static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
|
||||
static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
|
||||
unsigned int valid)
|
||||
{
|
||||
if (valid == 0) {
|
||||
__set_bit_le(nat_ofs, nm_i->empty_nat_bits);
|
||||
__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
|
||||
return;
|
||||
}
|
||||
|
||||
__clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
|
||||
if (valid == NAT_ENTRY_PER_BLOCK)
|
||||
__set_bit_le(nat_ofs, nm_i->full_nat_bits);
|
||||
else
|
||||
__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
|
||||
}
|
||||
|
||||
static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
|
||||
struct page *page)
|
||||
{
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
@ -2826,7 +2938,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
|
||||
int valid = 0;
|
||||
int i = 0;
|
||||
|
||||
if (!enabled_nat_bits(sbi, NULL))
|
||||
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
|
||||
return;
|
||||
|
||||
if (nat_index == 0) {
|
||||
@ -2837,17 +2949,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
|
||||
if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
|
||||
valid++;
|
||||
}
|
||||
if (valid == 0) {
|
||||
__set_bit_le(nat_index, nm_i->empty_nat_bits);
|
||||
__clear_bit_le(nat_index, nm_i->full_nat_bits);
|
||||
return;
|
||||
|
||||
__update_nat_bits(nm_i, nat_index, valid);
|
||||
}
|
||||
|
||||
void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
struct f2fs_nm_info *nm_i = NM_I(sbi);
|
||||
unsigned int nat_ofs;
|
||||
|
||||
f2fs_down_read(&nm_i->nat_tree_lock);
|
||||
|
||||
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
|
||||
unsigned int valid = 0, nid_ofs = 0;
|
||||
|
||||
/* handle nid zero due to it should never be used */
|
||||
if (unlikely(nat_ofs == 0)) {
|
||||
valid = 1;
|
||||
nid_ofs = 1;
|
||||
}
|
||||
|
||||
for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
|
||||
if (!test_bit_le(nid_ofs,
|
||||
nm_i->free_nid_bitmap[nat_ofs]))
|
||||
valid++;
|
||||
}
|
||||
|
||||
__update_nat_bits(nm_i, nat_ofs, valid);
|
||||
}
|
||||
|
||||
__clear_bit_le(nat_index, nm_i->empty_nat_bits);
|
||||
if (valid == NAT_ENTRY_PER_BLOCK)
|
||||
__set_bit_le(nat_index, nm_i->full_nat_bits);
|
||||
else
|
||||
__clear_bit_le(nat_index, nm_i->full_nat_bits);
|
||||
f2fs_up_read(&nm_i->nat_tree_lock);
|
||||
}
|
||||
|
||||
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
|
||||
@ -2866,7 +2997,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
|
||||
* #1, flush nat entries to journal in current hot data summary block.
|
||||
* #2, flush nat entries to nat page.
|
||||
*/
|
||||
if (enabled_nat_bits(sbi, cpc) ||
|
||||
if ((cpc->reason & CP_UMOUNT) ||
|
||||
!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
|
||||
to_journal = false;
|
||||
|
||||
@ -2913,7 +3044,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
|
||||
if (to_journal) {
|
||||
up_write(&curseg->journal_rwsem);
|
||||
} else {
|
||||
__update_nat_bits(sbi, start_nid, page);
|
||||
update_nat_bits(sbi, start_nid, page);
|
||||
f2fs_put_page(page, 1);
|
||||
}
|
||||
|
||||
@ -2940,30 +3071,35 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
LIST_HEAD(sets);
|
||||
int err = 0;
|
||||
|
||||
/* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
|
||||
if (enabled_nat_bits(sbi, cpc)) {
|
||||
down_write(&nm_i->nat_tree_lock);
|
||||
/*
|
||||
* during unmount, let's flush nat_bits before checking
|
||||
* nat_cnt[DIRTY_NAT].
|
||||
*/
|
||||
if (cpc->reason & CP_UMOUNT) {
|
||||
f2fs_down_write(&nm_i->nat_tree_lock);
|
||||
remove_nats_in_journal(sbi);
|
||||
up_write(&nm_i->nat_tree_lock);
|
||||
f2fs_up_write(&nm_i->nat_tree_lock);
|
||||
}
|
||||
|
||||
if (!nm_i->dirty_nat_cnt)
|
||||
if (!nm_i->nat_cnt[DIRTY_NAT])
|
||||
return 0;
|
||||
|
||||
down_write(&nm_i->nat_tree_lock);
|
||||
f2fs_down_write(&nm_i->nat_tree_lock);
|
||||
|
||||
/*
|
||||
* if there are no enough space in journal to store dirty nat
|
||||
* entries, remove all entries from journal and merge them
|
||||
* into nat entry set.
|
||||
*/
|
||||
if (enabled_nat_bits(sbi, cpc) ||
|
||||
!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
|
||||
if (cpc->reason & CP_UMOUNT ||
|
||||
!__has_cursum_space(journal,
|
||||
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
|
||||
remove_nats_in_journal(sbi);
|
||||
|
||||
while ((found = __gang_lookup_nat_set(nm_i,
|
||||
set_idx, SETVEC_SIZE, setvec))) {
|
||||
unsigned idx;
|
||||
|
||||
set_idx = setvec[found - 1]->set + 1;
|
||||
for (idx = 0; idx < found; idx++)
|
||||
__adjust_nat_entry_set(setvec[idx], &sets,
|
||||
@ -2977,7 +3113,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
|
||||
break;
|
||||
}
|
||||
|
||||
up_write(&nm_i->nat_tree_lock);
|
||||
f2fs_up_write(&nm_i->nat_tree_lock);
|
||||
/* Allow dirty nats by node block allocation in write_begin */
|
||||
|
||||
return err;
|
||||
@ -2992,15 +3128,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
|
||||
__u64 cp_ver = cur_cp_version(ckpt);
|
||||
block_t nat_bits_addr;
|
||||
|
||||
if (!enabled_nat_bits(sbi, NULL))
|
||||
return 0;
|
||||
|
||||
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
|
||||
nm_i->nat_bits = f2fs_kvzalloc(sbi,
|
||||
nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
|
||||
if (!nm_i->nat_bits)
|
||||
return -ENOMEM;
|
||||
|
||||
nm_i->full_nat_bits = nm_i->nat_bits + 8;
|
||||
nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
|
||||
|
||||
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
|
||||
return 0;
|
||||
|
||||
nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
|
||||
nm_i->nat_bits_blocks;
|
||||
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
|
||||
@ -3017,13 +3156,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
|
||||
|
||||
cp_ver |= (cur_cp_crc(ckpt) << 32);
|
||||
if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
|
||||
disable_nat_bits(sbi, true);
|
||||
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
|
||||
f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
|
||||
cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
|
||||
return 0;
|
||||
}
|
||||
|
||||
nm_i->full_nat_bits = nm_i->nat_bits + 8;
|
||||
nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
|
||||
|
||||
f2fs_notice(sbi, "Found nat_bits in checkpoint");
|
||||
return 0;
|
||||
}
|
||||
@ -3034,7 +3172,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
|
||||
unsigned int i = 0;
|
||||
nid_t nid, last_nid;
|
||||
|
||||
if (!enabled_nat_bits(sbi, NULL))
|
||||
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
|
||||
return;
|
||||
|
||||
for (i = 0; i < nm_i->nat_blocks; i++) {
|
||||
@ -3082,10 +3220,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
|
||||
F2FS_RESERVED_NODE_NUM;
|
||||
nm_i->nid_cnt[FREE_NID] = 0;
|
||||
nm_i->nid_cnt[PREALLOC_NID] = 0;
|
||||
nm_i->nat_cnt = 0;
|
||||
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
|
||||
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
|
||||
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
|
||||
nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
|
||||
|
||||
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
|
||||
INIT_LIST_HEAD(&nm_i->free_nid_list);
|
||||
@ -3096,14 +3234,11 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
|
||||
|
||||
mutex_init(&nm_i->build_lock);
|
||||
spin_lock_init(&nm_i->nid_list_lock);
|
||||
init_rwsem(&nm_i->nat_tree_lock);
|
||||
init_f2fs_rwsem(&nm_i->nat_tree_lock);
|
||||
|
||||
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
|
||||
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
|
||||
version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
|
||||
if (!version_bitmap)
|
||||
return -EFAULT;
|
||||
|
||||
nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
|
||||
GFP_KERNEL);
|
||||
if (!nm_i->nat_bitmap)
|
||||
@ -3205,7 +3340,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
|
||||
spin_unlock(&nm_i->nid_list_lock);
|
||||
|
||||
/* destroy nat cache */
|
||||
down_write(&nm_i->nat_tree_lock);
|
||||
f2fs_down_write(&nm_i->nat_tree_lock);
|
||||
while ((found = __gang_lookup_nat_cache(nm_i,
|
||||
nid, NATVEC_SIZE, natvec))) {
|
||||
unsigned idx;
|
||||
@ -3219,7 +3354,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
|
||||
__del_from_nat_cache(nm_i, natvec[idx]);
|
||||
}
|
||||
}
|
||||
f2fs_bug_on(sbi, nm_i->nat_cnt);
|
||||
f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]);
|
||||
|
||||
/* destroy nat set cache */
|
||||
nid = 0;
|
||||
@ -3235,7 +3370,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
|
||||
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
|
||||
}
|
||||
}
|
||||
up_write(&nm_i->nat_tree_lock);
|
||||
f2fs_up_write(&nm_i->nat_tree_lock);
|
||||
|
||||
kvfree(nm_i->nat_block_bitmap);
|
||||
if (nm_i->free_nid_bitmap) {
|
||||
@ -3253,7 +3388,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
|
||||
kvfree(nm_i->nat_bitmap_mir);
|
||||
#endif
|
||||
sbi->nm_info = NULL;
|
||||
kvfree(nm_i);
|
||||
kfree(nm_i);
|
||||
}
|
||||
|
||||
int __init f2fs_create_node_manager_caches(void)
|
||||
|
@ -31,6 +31,9 @@
|
||||
/* control total # of nats */
|
||||
#define DEF_NAT_CACHE_THRESHOLD 100000
|
||||
|
||||
/* control total # of node writes used for roll-forward recovery */
|
||||
#define DEF_RF_NODE_BLOCKS 0
|
||||
|
||||
/* vector size for gang look-up from nat cache that consists of radix tree */
|
||||
#define NATVEC_SIZE 64
|
||||
#define SETVEC_SIZE 32
|
||||
@ -38,6 +41,9 @@
|
||||
/* return value for read_node_page */
|
||||
#define LOCKED_PAGE 1
|
||||
|
||||
/* check pinned file's alignment status of physical blocks */
|
||||
#define FILE_NOT_ALIGNED 1
|
||||
|
||||
/* For flag in struct node_info */
|
||||
enum {
|
||||
IS_CHECKPOINTED, /* is it checkpointed before? */
|
||||
@ -126,18 +132,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
|
||||
|
||||
static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
|
||||
return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid *
|
||||
NM_I(sbi)->dirty_nats_ratio / 100;
|
||||
}
|
||||
|
||||
static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
|
||||
}
|
||||
|
||||
static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8;
|
||||
return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD;
|
||||
}
|
||||
|
||||
enum mem_type {
|
||||
@ -147,6 +148,8 @@ enum mem_type {
|
||||
INO_ENTRIES, /* indicates inode entries */
|
||||
EXTENT_CACHE, /* indicates extent cache */
|
||||
INMEM_PAGES, /* indicates inmemory pages */
|
||||
DISCARD_CACHE, /* indicates memory of cached discard cmds */
|
||||
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
|
||||
BASE_CHECK, /* check kernel status */
|
||||
};
|
||||
|
||||
@ -388,20 +391,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
|
||||
* - Mark cold node blocks in their node footer
|
||||
* - Mark cold data pages in page cache
|
||||
*/
|
||||
static inline int is_cold_data(struct page *page)
|
||||
{
|
||||
return PageChecked(page);
|
||||
}
|
||||
|
||||
static inline void set_cold_data(struct page *page)
|
||||
{
|
||||
SetPageChecked(page);
|
||||
}
|
||||
|
||||
static inline void clear_cold_data(struct page *page)
|
||||
{
|
||||
ClearPageChecked(page);
|
||||
}
|
||||
|
||||
static inline int is_node(struct page *page, int type)
|
||||
{
|
||||
@ -413,21 +402,6 @@ static inline int is_node(struct page *page, int type)
|
||||
#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
|
||||
#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
|
||||
|
||||
static inline int is_inline_node(struct page *page)
|
||||
{
|
||||
return PageChecked(page);
|
||||
}
|
||||
|
||||
static inline void set_inline_node(struct page *page)
|
||||
{
|
||||
SetPageChecked(page);
|
||||
}
|
||||
|
||||
static inline void clear_inline_node(struct page *page)
|
||||
{
|
||||
ClearPageChecked(page);
|
||||
}
|
||||
|
||||
static inline void set_cold_node(struct page *page, bool is_dir)
|
||||
{
|
||||
struct f2fs_node *rn = F2FS_NODE(page);

fs/f2fs/recovery.c
@ -45,12 +45,20 @@
|
||||
|
||||
static struct kmem_cache *fsync_entry_slab;
|
||||
|
||||
#ifdef CONFIG_UNICODE
|
||||
extern struct kmem_cache *f2fs_cf_name_slab;
|
||||
#endif
|
||||
|
||||
bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
|
||||
|
||||
if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
|
||||
return false;
|
||||
if (NM_I(sbi)->max_rf_node_blocks &&
|
||||
percpu_counter_sum_positive(&sbi->rf_node_block_count) >=
|
||||
NM_I(sbi)->max_rf_node_blocks)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
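The hunk above adds a cap on roll-forward work: when max_rf_node_blocks is non-zero and the number of node blocks written for fsync reaches it, f2fs_space_for_roll_forward() returns false so that fsync takes a checkpoint instead of extending the recovery chain (DEF_RF_NODE_BLOCKS, i.e. 0, keeps the cap disabled). Below is a minimal user-space sketch of tuning that cap through the sysfs knob added later in this series (max_roll_forward_node_blocks); the device name and the error handling are illustrative assumptions, not part of the patch.

/*
 * Sketch only: cap the node blocks replayed by roll-forward recovery.
 * The mount device name "sda1" is an assumption.
 */
#include <stdio.h>

int main(void)
{
	const char *knob = "/sys/fs/f2fs/sda1/max_roll_forward_node_blocks";
	FILE *f = fopen(knob, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* 0 keeps the default (no limit); any other value caps recovery work */
	fprintf(f, "%u\n", 512u);
	fclose(f);
	return 0;
}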
@ -77,7 +85,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
|
||||
if (IS_ERR(inode))
|
||||
return ERR_CAST(inode);
|
||||
|
||||
err = dquot_initialize(inode);
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err)
|
||||
goto err_out;
|
||||
|
||||
@ -87,7 +95,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
|
||||
entry = f2fs_kmem_cache_alloc(fsync_entry_slab,
|
||||
GFP_F2FS_ZERO, true, NULL);
|
||||
entry->inode = inode;
|
||||
list_add_tail(&entry->list, head);
|
||||
|
||||
@ -145,7 +154,7 @@ static int init_recovered_filename(const struct inode *dir,
|
||||
f2fs_hash_filename(dir, fname);
|
||||
#ifdef CONFIG_UNICODE
|
||||
/* Case-sensitive match is fine for recovery */
|
||||
kfree(fname->cf_name.name);
|
||||
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
|
||||
fname->cf_name.name = NULL;
|
||||
#endif
|
||||
} else {
|
||||
@ -198,7 +207,7 @@ retry:
|
||||
goto out_put;
|
||||
}
|
||||
|
||||
err = dquot_initialize(einode);
|
||||
err = f2fs_dquot_initialize(einode);
|
||||
if (err) {
|
||||
iput(einode);
|
||||
goto out_put;
|
||||
@ -337,6 +346,19 @@ static int recover_inode(struct inode *inode, struct page *page)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
|
||||
unsigned int ra_blocks, unsigned int blkaddr,
|
||||
unsigned int next_blkaddr)
|
||||
{
|
||||
if (blkaddr + 1 == next_blkaddr)
|
||||
ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
|
||||
ra_blocks * 2);
|
||||
else if (next_blkaddr % sbi->blocks_per_seg)
|
||||
ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
|
||||
ra_blocks / 2);
|
||||
return ra_blocks;
|
||||
}
|
||||
|
||||
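adjust_por_ra_blocks() above tunes the recovery readahead window: it doubles while the warm node chain stays physically contiguous and halves when the next block lands somewhere inside a segment rather than at a segment boundary. A small standalone sketch of that doubling/halving policy follows; the segment size and the MIN/MAX limits are made-up stand-ins, since the real RECOVERY_MIN_RA_BLOCKS / RECOVERY_MAX_RA_BLOCKS constants live in f2fs.h and are not shown in this diff.

/* Standalone illustration of the readahead heuristic; constants are fake. */
#include <stdio.h>

#define FAKE_BLOCKS_PER_SEG	512u
#define FAKE_MAX_RA_BLOCKS	256u
#define FAKE_MIN_RA_BLOCKS	1u

static unsigned int adjust(unsigned int ra, unsigned int blkaddr,
			   unsigned int next_blkaddr)
{
	if (blkaddr + 1 == next_blkaddr)	/* chain is contiguous: grow */
		return ra * 2 > FAKE_MAX_RA_BLOCKS ? FAKE_MAX_RA_BLOCKS : ra * 2;
	if (next_blkaddr % FAKE_BLOCKS_PER_SEG)	/* jump inside a segment: shrink */
		return ra / 2 < FAKE_MIN_RA_BLOCKS ? FAKE_MIN_RA_BLOCKS : ra / 2;
	return ra;				/* jump to a segment start: keep window */
}

int main(void)
{
	unsigned int ra = FAKE_MAX_RA_BLOCKS;

	ra = adjust(ra, 1000, 1001);	/* contiguous: stays clamped at max */
	ra = adjust(ra, 1001, 1500);	/* scattered: halved */
	printf("ra_blocks = %u\n", ra);
	return 0;
}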
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
|
||||
bool check_only)
|
||||
{
|
||||
@ -344,6 +366,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
|
||||
struct page *page = NULL;
|
||||
block_t blkaddr;
|
||||
unsigned int loop_cnt = 0;
|
||||
unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
|
||||
unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg -
|
||||
valid_user_blocks(sbi);
|
||||
int err = 0;
|
||||
@ -418,11 +441,14 @@ next:
|
||||
break;
|
||||
}
|
||||
|
||||
ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
|
||||
next_blkaddr_of_node(page));
|
||||
|
||||
/* check next segment */
|
||||
blkaddr = next_blkaddr_of_node(page);
|
||||
f2fs_put_page(page, 1);
|
||||
|
||||
f2fs_ra_meta_pages_cond(sbi, blkaddr);
|
||||
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
@ -458,6 +484,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
|
||||
/* Get the previous summary */
|
||||
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
|
||||
struct curseg_info *curseg = CURSEG_I(sbi, i);
|
||||
|
||||
if (curseg->segno == segno) {
|
||||
sum = curseg->sum_blk->entries[blkoff];
|
||||
goto got_it;
|
||||
@ -502,7 +529,7 @@ got_it:
|
||||
if (IS_ERR(inode))
|
||||
return PTR_ERR(inode);
|
||||
|
||||
ret = dquot_initialize(inode);
|
||||
ret = f2fs_dquot_initialize(inode);
|
||||
if (ret) {
|
||||
iput(inode);
|
||||
return ret;
|
||||
@ -554,7 +581,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
|
||||
|
||||
/* step 1: recover xattr */
|
||||
if (IS_INODE(page)) {
|
||||
f2fs_recover_inline_xattr(inode, page);
|
||||
err = f2fs_recover_inline_xattr(inode, page);
|
||||
if (err)
|
||||
goto out;
|
||||
} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
|
||||
err = f2fs_recover_xattr_data(inode, page);
|
||||
if (!err)
|
||||
@ -563,8 +592,12 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
|
||||
}
|
||||
|
||||
/* step 2: recover inline data */
|
||||
if (f2fs_recover_inline_data(inode, page))
|
||||
err = f2fs_recover_inline_data(inode, page);
|
||||
if (err) {
|
||||
if (err == 1)
|
||||
err = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* step 3: recover data indices */
|
||||
start = f2fs_start_bidx_of_node(ofs_of_node(page), inode);
|
||||
@ -583,7 +616,7 @@ retry_dn:
|
||||
|
||||
f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
|
||||
|
||||
err = f2fs_get_node_info(sbi, dn.nid, &ni);
|
||||
err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
|
||||
if (err)
|
||||
goto err;
|
||||
|
||||
@ -692,6 +725,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
|
||||
struct page *page = NULL;
|
||||
int err = 0;
|
||||
block_t blkaddr;
|
||||
unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
|
||||
|
||||
/* get node pages in the current segment */
|
||||
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
|
||||
@ -703,8 +737,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
|
||||
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
|
||||
break;
|
||||
|
||||
f2fs_ra_meta_pages_cond(sbi, blkaddr);
|
||||
|
||||
page = f2fs_get_tmp_page(sbi, blkaddr);
|
||||
if (IS_ERR(page)) {
|
||||
err = PTR_ERR(page);
|
||||
@ -747,12 +779,17 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
|
||||
if (entry->blkaddr == blkaddr)
|
||||
list_move_tail(&entry->list, tmp_inode_list);
|
||||
next:
|
||||
ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
|
||||
next_blkaddr_of_node(page));
|
||||
|
||||
/* check next segment */
|
||||
blkaddr = next_blkaddr_of_node(page);
|
||||
f2fs_put_page(page, 1);
|
||||
|
||||
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
|
||||
}
|
||||
if (!err)
|
||||
f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
|
||||
f2fs_allocate_new_segments(sbi);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -774,25 +811,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_QUOTA
|
||||
/* Needed for iput() to work correctly and not trash data */
|
||||
sbi->sb->s_flags |= MS_ACTIVE;
|
||||
/* Turn on quotas so that they are updated correctly */
|
||||
quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
|
||||
#endif
|
||||
|
||||
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
|
||||
sizeof(struct fsync_inode_entry));
|
||||
if (!fsync_entry_slab) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&inode_list);
|
||||
INIT_LIST_HEAD(&tmp_inode_list);
|
||||
INIT_LIST_HEAD(&dir_list);
|
||||
|
||||
/* prevent checkpoint */
|
||||
mutex_lock(&sbi->cp_mutex);
|
||||
f2fs_down_write(&sbi->cp_global_sem);
|
||||
|
||||
/* step #1: find fsynced inode numbers */
|
||||
err = find_fsync_dnodes(sbi, &inode_list, check_only);
|
||||
@ -810,10 +838,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
|
||||
err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list);
|
||||
if (!err)
|
||||
f2fs_bug_on(sbi, !list_empty(&inode_list));
|
||||
else {
|
||||
/* restore s_flags to let iput() trash data */
|
||||
sbi->sb->s_flags = s_flags;
|
||||
}
|
||||
else
|
||||
f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE);
|
||||
skip:
|
||||
destroy_fsync_dnodes(&inode_list, err);
|
||||
destroy_fsync_dnodes(&tmp_inode_list, err);
|
||||
@ -828,7 +854,7 @@ skip:
|
||||
} else {
|
||||
clear_sbi_flag(sbi, SBI_POR_DOING);
|
||||
}
|
||||
mutex_unlock(&sbi->cp_mutex);
|
||||
f2fs_up_write(&sbi->cp_global_sem);
|
||||
|
||||
/* let's drop all the directory inodes for clean checkpoint */
|
||||
destroy_fsync_dnodes(&dir_list, err);
|
||||
@ -844,8 +870,6 @@ skip:
|
||||
}
|
||||
}
|
||||
|
||||
kmem_cache_destroy(fsync_entry_slab);
|
||||
out:
|
||||
#ifdef CONFIG_QUOTA
|
||||
/* Turn quotas off */
|
||||
if (quota_enabled)
|
||||
@ -853,5 +877,19 @@ out:
|
||||
#endif
|
||||
sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
|
||||
|
||||
return ret ? ret: err;
|
||||
return ret ? ret : err;
|
||||
}
|
||||
|
||||
int __init f2fs_create_recovery_cache(void)
|
||||
{
|
||||
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
|
||||
sizeof(struct fsync_inode_entry));
|
||||
if (!fsync_entry_slab)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void f2fs_destroy_recovery_cache(void)
|
||||
{
|
||||
kmem_cache_destroy(fsync_entry_slab);
|
||||
}

fs/f2fs/segment.c (1058): file diff suppressed because it is too large.

fs/f2fs/segment.h
@ -16,13 +16,20 @@
|
||||
#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */
|
||||
|
||||
#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
|
||||
#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */
|
||||
|
||||
/* L: Logical segment # in volume, R: Relative segment # in main area */
|
||||
#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
|
||||
#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
|
||||
|
||||
#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
|
||||
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE)
|
||||
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE)
|
||||
|
||||
static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
|
||||
unsigned short seg_type)
|
||||
{
|
||||
f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG);
|
||||
}
|
||||
|
||||
#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA)
|
||||
#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA)
|
||||
@ -34,7 +41,9 @@
|
||||
((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
|
||||
((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
|
||||
((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
|
||||
((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
|
||||
((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \
|
||||
((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \
|
||||
((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno))
|
||||
|
||||
#define IS_CURSEC(sbi, secno) \
|
||||
(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
|
||||
@ -48,7 +57,11 @@
|
||||
((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
|
||||
(sbi)->segs_per_sec) || \
|
||||
((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
|
||||
(sbi)->segs_per_sec)) \
|
||||
(sbi)->segs_per_sec) || \
|
||||
((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \
|
||||
(sbi)->segs_per_sec) || \
|
||||
((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \
|
||||
(sbi)->segs_per_sec))
|
||||
|
||||
#define MAIN_BLKADDR(sbi) \
|
||||
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
|
||||
@ -129,23 +142,28 @@ enum {
|
||||
};
|
||||
|
||||
/*
|
||||
* In the victim_sel_policy->alloc_mode, there are two block allocation modes.
|
||||
* In the victim_sel_policy->alloc_mode, there are three block allocation modes.
|
||||
* LFS writes data sequentially with cleaning operations.
|
||||
* SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
|
||||
* AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into
|
||||
* fragmented segment which has similar aging degree.
|
||||
*/
|
||||
enum {
|
||||
LFS = 0,
|
||||
SSR
|
||||
SSR,
|
||||
AT_SSR,
|
||||
};
|
||||
|
||||
/*
|
||||
* In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
|
||||
* In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes.
|
||||
* GC_CB is based on cost-benefit algorithm.
|
||||
* GC_GREEDY is based on greedy algorithm.
|
||||
* GC_AT is based on age-threshold algorithm.
|
||||
*/
|
||||
enum {
|
||||
GC_CB = 0,
|
||||
GC_GREEDY,
|
||||
GC_AT,
|
||||
ALLOC_NEXT,
|
||||
FLUSH_DEVICE,
|
||||
MAX_GC_POLICY,
|
||||
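The comment above only names the cleaning policies; for orientation, here is a hedged sketch of the cost-benefit score that GC_CB is commonly described by (a paraphrase for illustration, not code taken from this series): a candidate's age is weighted against the share of still-valid blocks, and the section with the lowest resulting cost becomes the victim.

/* Rough cost-benefit scoring sketch; lower cost means a better GC victim. */
#include <limits.h>
#include <stdio.h>

static unsigned int cb_cost(unsigned int valid_pct /* u: 0..100 */,
			    unsigned int age /* 0 (young) .. 100 (old) */)
{
	/* benefit = (1 - u) * age, cost = 1 + u, scaled by 100 */
	return UINT_MAX - ((100 * (100 - valid_pct) * age) / (100 + valid_pct));
}

int main(void)
{
	/* an old, mostly-empty section scores lower (better) than a young, full one */
	printf("old/empty : %u\n", cb_cost(10, 90));
	printf("young/full: %u\n", cb_cost(80, 10));
	return 0;
}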
@ -154,24 +172,28 @@ enum {
|
||||
/*
|
||||
* BG_GC means the background cleaning job.
|
||||
* FG_GC means the on-demand cleaning job.
|
||||
* FORCE_FG_GC means on-demand cleaning job in background.
|
||||
*/
|
||||
enum {
|
||||
BG_GC = 0,
|
||||
FG_GC,
|
||||
FORCE_FG_GC,
|
||||
};
|
||||
|
||||
/* for a function parameter to select a victim segment */
|
||||
struct victim_sel_policy {
|
||||
int alloc_mode; /* LFS or SSR */
|
||||
int gc_mode; /* GC_CB or GC_GREEDY */
|
||||
unsigned long *dirty_segmap; /* dirty segment bitmap */
|
||||
unsigned int max_search; /* maximum # of segments to search */
|
||||
unsigned long *dirty_bitmap; /* dirty segment/section bitmap */
|
||||
unsigned int max_search; /*
|
||||
* maximum # of segments/sections
|
||||
* to search
|
||||
*/
|
||||
unsigned int offset; /* last scanned bitmap offset */
|
||||
unsigned int ofs_unit; /* bitmap search unit */
|
||||
unsigned int min_cost; /* minimum cost */
|
||||
unsigned long long oldest_age; /* oldest age of segments having the same min cost */
|
||||
unsigned int min_segno; /* segment # having min. cost */
|
||||
unsigned long long age; /* mtime of GCed section*/
|
||||
unsigned long long age_threshold;/* age threshold */
|
||||
};
|
||||
|
||||
struct seg_entry {
|
||||
@ -184,7 +206,7 @@ struct seg_entry {
|
||||
unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */
|
||||
#endif
|
||||
/*
|
||||
* # of valid blocks and the validity bitmap stored in the the last
|
||||
* # of valid blocks and the validity bitmap stored in the last
|
||||
* checkpoint pack. This information is used by the SSR mode.
|
||||
*/
|
||||
unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */
|
||||
@ -237,6 +259,8 @@ struct sit_info {
|
||||
unsigned long long mounted_time; /* mount time */
|
||||
unsigned long long min_mtime; /* min. modification time */
|
||||
unsigned long long max_mtime; /* max. modification time */
|
||||
unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */
|
||||
unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */
|
||||
|
||||
unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
|
||||
};
|
||||
@ -266,6 +290,7 @@ enum dirty_type {
|
||||
struct dirty_seglist_info {
|
||||
const struct victim_selection *v_ops; /* victim selction operation */
|
||||
unsigned long *dirty_segmap[NR_DIRTY_TYPE];
|
||||
unsigned long *dirty_secmap;
|
||||
struct mutex seglist_lock; /* lock for segment bitmaps */
|
||||
int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
|
||||
unsigned long *victim_secmap; /* background GC victims */
|
||||
@ -274,7 +299,7 @@ struct dirty_seglist_info {
|
||||
/* victim selection function for cleaning and SSR */
|
||||
struct victim_selection {
|
||||
int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
|
||||
int, int, char);
|
||||
int, int, char, unsigned long long);
|
||||
};
|
||||
|
||||
/* for active log information */
|
||||
@ -284,10 +309,13 @@ struct curseg_info {
|
||||
struct rw_semaphore journal_rwsem; /* protect journal area */
|
||||
struct f2fs_journal *journal; /* cached journal info */
|
||||
unsigned char alloc_type; /* current allocation type */
|
||||
unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */
|
||||
unsigned int segno; /* current segment number */
|
||||
unsigned short next_blkoff; /* next block offset to write */
|
||||
unsigned int zone; /* current zone number */
|
||||
unsigned int next_segno; /* preallocated segment */
|
||||
int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */
|
||||
bool inited; /* indicate inmem log is inited */
|
||||
};
|
||||
|
||||
struct sit_entry_set {
|
||||
@ -301,8 +329,6 @@ struct sit_entry_set {
|
||||
*/
|
||||
static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
|
||||
{
|
||||
if (type == CURSEG_COLD_DATA_PINNED)
|
||||
type = CURSEG_COLD_DATA;
|
||||
return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
|
||||
}
|
||||
|
||||
@ -334,8 +360,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
|
||||
}
|
||||
|
||||
static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
|
||||
unsigned int segno)
|
||||
unsigned int segno, bool use_section)
|
||||
{
|
||||
if (use_section && __is_large_section(sbi)) {
|
||||
unsigned int start_segno = START_SEGNO(segno);
|
||||
unsigned int blocks = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) {
|
||||
struct seg_entry *se = get_seg_entry(sbi, start_segno);
|
||||
|
||||
blocks += se->ckpt_valid_blocks;
|
||||
}
|
||||
return blocks;
|
||||
}
|
||||
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
|
||||
}
|
||||
|
||||
@ -407,6 +445,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
|
||||
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
|
||||
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
|
||||
unsigned int next;
|
||||
unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
|
||||
|
||||
spin_lock(&free_i->segmap_lock);
|
||||
clear_bit(segno, free_i->free_segmap);
|
||||
@ -414,7 +453,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
|
||||
|
||||
next = find_next_bit(free_i->free_segmap,
|
||||
start_segno + sbi->segs_per_sec, start_segno);
|
||||
if (next >= start_segno + sbi->segs_per_sec) {
|
||||
if (next >= start_segno + usable_segs) {
|
||||
clear_bit(secno, free_i->free_secmap);
|
||||
free_i->free_sections++;
|
||||
}
|
||||
@ -434,22 +473,23 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi,
|
||||
}
|
||||
|
||||
static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
|
||||
unsigned int segno)
|
||||
unsigned int segno, bool inmem)
|
||||
{
|
||||
struct free_segmap_info *free_i = FREE_I(sbi);
|
||||
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
|
||||
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
|
||||
unsigned int next;
|
||||
unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
|
||||
|
||||
spin_lock(&free_i->segmap_lock);
|
||||
if (test_and_clear_bit(segno, free_i->free_segmap)) {
|
||||
free_i->free_segments++;
|
||||
|
||||
if (IS_CURSEC(sbi, secno))
|
||||
if (!inmem && IS_CURSEC(sbi, secno))
|
||||
goto skip_free;
|
||||
next = find_next_bit(free_i->free_segmap,
|
||||
start_segno + sbi->segs_per_sec, start_segno);
|
||||
if (next >= start_segno + sbi->segs_per_sec) {
|
||||
if (next >= start_segno + usable_segs) {
|
||||
if (test_and_clear_bit(secno, free_i->free_secmap))
|
||||
free_i->free_sections++;
|
||||
}
|
||||
@ -496,9 +536,10 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
|
||||
return FREE_I(sbi)->free_segments;
|
||||
}
|
||||
|
||||
static inline int reserved_segments(struct f2fs_sb_info *sbi)
|
||||
static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
return SM_I(sbi)->reserved_segments;
|
||||
return SM_I(sbi)->reserved_segments +
|
||||
SM_I(sbi)->additional_reserved_segments;
|
||||
}
|
||||
|
||||
static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
|
||||
@ -528,7 +569,7 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
|
||||
|
||||
static inline int reserved_sections(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
|
||||
return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
|
||||
}
|
||||
|
||||
static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
|
||||
@ -542,8 +583,8 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
|
||||
/* check current node segment */
|
||||
for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
|
||||
segno = CURSEG_I(sbi, i)->segno;
|
||||
left_blocks = sbi->blocks_per_seg -
|
||||
get_seg_entry(sbi, segno)->ckpt_valid_blocks;
|
||||
left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
|
||||
get_seg_entry(sbi, segno)->ckpt_valid_blocks;
|
||||
|
||||
if (node_blocks > left_blocks)
|
||||
return false;
|
||||
@ -551,7 +592,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
|
||||
|
||||
/* check current data segment */
|
||||
segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
|
||||
left_blocks = sbi->blocks_per_seg -
|
||||
left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
|
||||
get_seg_entry(sbi, segno)->ckpt_valid_blocks;
|
||||
if (dent_blocks > left_blocks)
|
||||
return false;
|
||||
@ -610,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
|
||||
* pages over min_fsync_blocks. (=default option)
|
||||
* F2FS_IPU_ASYNC - do IPU given by asynchronous write requests.
|
||||
* F2FS_IPU_NOCACHE - disable IPU bio cache.
|
||||
* F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode)
|
||||
* F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has
|
||||
* FI_OPU_WRITE flag.
|
||||
* F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode)
|
||||
*/
|
||||
#define DEF_MIN_IPU_UTIL 70
|
||||
#define DEF_MIN_FSYNC_BLOCKS 20
|
||||
@ -626,6 +669,7 @@ enum {
|
||||
F2FS_IPU_FSYNC,
|
||||
F2FS_IPU_ASYNC,
|
||||
F2FS_IPU_NOCACHE,
|
||||
F2FS_IPU_HONOR_OPU_WRITE,
|
||||
};
|
||||
|
||||
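The ipu_policy sysfs value is documented as a bitmap of the enum above, so the new F2FS_IPU_HONOR_OPU_WRITE policy is enabled by setting its bit alongside any others. A small illustration follows; the earlier enum members are reproduced from the surrounding header as an assumption, since this hunk only shows the tail of the enum.

/* Illustration only: build an ipu_policy bitmap value. */
#include <stdio.h>

enum {	/* positions mirror the kernel enum; only the tail is shown in the hunk */
	F2FS_IPU_FORCE = 0,
	F2FS_IPU_SSR,
	F2FS_IPU_UTIL,
	F2FS_IPU_SSR_UTIL,
	F2FS_IPU_FSYNC,
	F2FS_IPU_ASYNC,
	F2FS_IPU_NOCACHE,
	F2FS_IPU_HONOR_OPU_WRITE,
};

int main(void)
{
	unsigned int policy = (1u << F2FS_IPU_FSYNC) |
			      (1u << F2FS_IPU_HONOR_OPU_WRITE);

	/* value to write into /sys/fs/f2fs/<dev>/ipu_policy */
	printf("%u\n", policy);
	return 0;
}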
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
|
||||
@ -673,21 +717,22 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
|
||||
bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
|
||||
int valid_blocks = 0;
|
||||
int cur_pos = 0, next_pos;
|
||||
unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno);
|
||||
|
||||
/* check bitmap with valid block count */
|
||||
do {
|
||||
if (is_valid) {
|
||||
next_pos = find_next_zero_bit_le(&raw_sit->valid_map,
|
||||
sbi->blocks_per_seg,
|
||||
usable_blks_per_seg,
|
||||
cur_pos);
|
||||
valid_blocks += next_pos - cur_pos;
|
||||
} else
|
||||
next_pos = find_next_bit_le(&raw_sit->valid_map,
|
||||
sbi->blocks_per_seg,
|
||||
usable_blks_per_seg,
|
||||
cur_pos);
|
||||
cur_pos = next_pos;
|
||||
is_valid = !is_valid;
|
||||
} while (cur_pos < sbi->blocks_per_seg);
|
||||
} while (cur_pos < usable_blks_per_seg);
|
||||
|
||||
if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) {
|
||||
f2fs_err(sbi, "Mismatch valid blocks %d vs. %d",
|
||||
@ -696,8 +741,13 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
|
||||
return -EFSCORRUPTED;
|
||||
}
|
||||
|
||||
if (usable_blks_per_seg < sbi->blocks_per_seg)
|
||||
f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map,
|
||||
sbi->blocks_per_seg,
|
||||
usable_blks_per_seg) != sbi->blocks_per_seg);
|
||||
|
||||
/* check segment usage, and check boundary of a given segment number */
|
||||
if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
|
||||
if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg
|
||||
|| segno > TOTAL_SEGS(sbi) - 1)) {
|
||||
f2fs_err(sbi, "Wrong valid blocks %d or segno %u",
|
||||
GET_SIT_VBLOCKS(raw_sit), segno);

fs/f2fs/shrinker.c
@ -18,9 +18,7 @@ static unsigned int shrinker_run_no;
|
||||
|
||||
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
|
||||
|
||||
return count > 0 ? count : 0;
|
||||
return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT];
|
||||
}
|
||||
|
||||
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)

fs/f2fs/super.c (1108): file diff suppressed because it is too large.

fs/f2fs/sysfs.c (700)
@ -11,10 +11,13 @@
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/unicode.h>
|
||||
#include <linux/ioprio.h>
|
||||
#include <linux/sysfs.h>
|
||||
|
||||
#include "f2fs.h"
|
||||
#include "segment.h"
|
||||
#include "gc.h"
|
||||
#include "iostat.h"
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
static struct proc_dir_entry *f2fs_proc_root;
|
||||
@ -27,13 +30,25 @@ enum {
|
||||
NM_INFO, /* struct f2fs_nm_info */
|
||||
F2FS_SBI, /* struct f2fs_sb_info */
|
||||
#ifdef CONFIG_F2FS_STAT_FS
|
||||
STAT_INFO, /* struct f2fs_stat_info */
|
||||
STAT_INFO, /* struct f2fs_stat_info */
|
||||
#endif
|
||||
#ifdef CONFIG_F2FS_FAULT_INJECTION
|
||||
FAULT_INFO_RATE, /* struct f2fs_fault_info */
|
||||
FAULT_INFO_TYPE, /* struct f2fs_fault_info */
|
||||
#endif
|
||||
RESERVED_BLOCKS, /* struct f2fs_sb_info */
|
||||
CPRC_INFO, /* struct ckpt_req_control */
|
||||
ATGC_INFO, /* struct atgc_management */
|
||||
};
|
||||
|
||||
static const char *gc_mode_names[MAX_GC_MODE] = {
|
||||
"GC_NORMAL",
|
||||
"GC_IDLE_CB",
|
||||
"GC_IDLE_GREEDY",
|
||||
"GC_IDLE_AT",
|
||||
"GC_URGENT_HIGH",
|
||||
"GC_URGENT_LOW",
|
||||
"GC_URGENT_MID"
|
||||
};
|
||||
|
||||
struct f2fs_attr {
|
||||
@ -70,6 +85,10 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
|
||||
else if (struct_type == STAT_INFO)
|
||||
return (unsigned char *)F2FS_STAT(sbi);
|
||||
#endif
|
||||
else if (struct_type == CPRC_INFO)
|
||||
return (unsigned char *)&sbi->cprc_info;
|
||||
else if (struct_type == ATGC_INFO)
|
||||
return (unsigned char *)&sbi->am;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -87,28 +106,42 @@ static ssize_t free_segments_show(struct f2fs_attr *a,
|
||||
(unsigned long long)(free_segments(sbi)));
|
||||
}
|
||||
|
||||
static ssize_t ovp_segments_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%llu\n",
|
||||
(unsigned long long)(overprovision_segments(sbi)));
|
||||
}
|
||||
|
||||
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
struct super_block *sb = sbi->sb;
|
||||
|
||||
if (!sb->s_bdev->bd_part)
|
||||
return sprintf(buf, "0\n");
|
||||
|
||||
return sprintf(buf, "%llu\n",
|
||||
(unsigned long long)(sbi->kbytes_written +
|
||||
BD_PART_WRITTEN(sbi)));
|
||||
((f2fs_get_sectors_written(sbi) -
|
||||
sbi->sectors_written_start) >> 1)));
|
||||
}
|
||||
|
||||
static ssize_t sb_status_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%lx\n", sbi->s_flag);
|
||||
}
|
||||
|
||||
static ssize_t pending_discard_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
if (!SM_I(sbi)->dcc_info)
|
||||
return -EINVAL;
|
||||
return sprintf(buf, "%llu\n", (unsigned long long)atomic_read(
|
||||
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
|
||||
}
|
||||
|
||||
static ssize_t features_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
struct super_block *sb = sbi->sb;
|
||||
int len = 0;
|
||||
|
||||
if (!sb->s_bdev->bd_part)
|
||||
return sprintf(buf, "0\n");
|
||||
|
||||
if (f2fs_sb_has_encrypt(sbi))
|
||||
len += scnprintf(buf, PAGE_SIZE - len, "%s",
|
||||
"encryption");
|
||||
@ -145,6 +178,9 @@ static ssize_t features_show(struct f2fs_attr *a,
|
||||
if (f2fs_sb_has_casefold(sbi))
|
||||
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
|
||||
len ? ", " : "", "casefold");
|
||||
if (f2fs_sb_has_readonly(sbi))
|
||||
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
|
||||
len ? ", " : "", "readonly");
|
||||
if (f2fs_sb_has_compression(sbi))
|
||||
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
|
||||
len ? ", " : "", "compression");
|
||||
@ -225,6 +261,13 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
|
||||
}
|
||||
#endif
|
||||
|
||||
static ssize_t main_blkaddr_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
return snprintf(buf, PAGE_SIZE, "%llu\n",
|
||||
(unsigned long long)MAIN_BLKADDR(sbi));
|
||||
}
|
||||
|
||||
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
@ -256,6 +299,50 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
|
||||
return len;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
int len = 0;
|
||||
int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio);
|
||||
int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio);
|
||||
|
||||
if (class == IOPRIO_CLASS_RT)
|
||||
len += scnprintf(buf + len, PAGE_SIZE - len, "rt,");
|
||||
else if (class == IOPRIO_CLASS_BE)
|
||||
len += scnprintf(buf + len, PAGE_SIZE - len, "be,");
|
||||
else
|
||||
return -EINVAL;
|
||||
|
||||
len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data);
|
||||
return len;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
if (!strcmp(a->attr.name, "compr_written_block"))
|
||||
return snprintf(buf, PAGE_SIZE, "%llu\n",
|
||||
sbi->compr_written_block);
|
||||
|
||||
if (!strcmp(a->attr.name, "compr_saved_block"))
|
||||
return snprintf(buf, PAGE_SIZE, "%llu\n",
|
||||
sbi->compr_saved_block);
|
||||
|
||||
if (!strcmp(a->attr.name, "compr_new_inode"))
|
||||
return snprintf(buf, PAGE_SIZE, "%u\n",
|
||||
sbi->compr_new_inode);
|
||||
#endif
|
||||
|
||||
if (!strcmp(a->attr.name, "gc_urgent"))
|
||||
return snprintf(buf, PAGE_SIZE, "%s\n",
|
||||
gc_mode_names[sbi->gc_mode]);
|
||||
|
||||
if (!strcmp(a->attr.name, "gc_segment_mode"))
|
||||
return snprintf(buf, PAGE_SIZE, "%s\n",
|
||||
gc_mode_names[sbi->gc_segment_mode]);
|
||||
|
||||
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
|
||||
return snprintf(buf, PAGE_SIZE, "%u\n",
|
||||
sbi->gc_reclaimed_segs[sbi->gc_segment_mode]);
|
||||
}
|
||||
|
||||
ui = (unsigned int *)(ptr + a->offset);
|
||||
|
||||
return sprintf(buf, "%u\n", *ui);
|
||||
@ -292,10 +379,10 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
|
||||
set = false;
|
||||
}
|
||||
|
||||
if (strlen(name) >= F2FS_EXTENSION_LEN)
|
||||
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
|
||||
return -EINVAL;
|
||||
|
||||
down_write(&sbi->sb_lock);
|
||||
f2fs_down_write(&sbi->sb_lock);
|
||||
|
||||
ret = f2fs_update_extension_list(sbi, name, hot, set);
|
||||
if (ret)
|
||||
@ -305,10 +392,42 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
|
||||
if (ret)
|
||||
f2fs_update_extension_list(sbi, name, hot, !set);
|
||||
out:
|
||||
up_write(&sbi->sb_lock);
|
||||
f2fs_up_write(&sbi->sb_lock);
|
||||
return ret ? ret : count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
|
||||
const char *name = strim((char *)buf);
|
||||
struct ckpt_req_control *cprc = &sbi->cprc_info;
|
||||
int class;
|
||||
long data;
|
||||
int ret;
|
||||
|
||||
if (!strncmp(name, "rt,", 3))
|
||||
class = IOPRIO_CLASS_RT;
|
||||
else if (!strncmp(name, "be,", 3))
|
||||
class = IOPRIO_CLASS_BE;
|
||||
else
|
||||
return -EINVAL;
|
||||
|
||||
name += 3;
|
||||
ret = kstrtol(name, 10, &data);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (data >= IOPRIO_BE_NR || data < 0)
|
||||
return -EINVAL;
|
||||
|
||||
cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data);
|
||||
if (test_opt(sbi, MERGE_CHECKPOINT)) {
|
||||
ret = set_task_ioprio(cprc->f2fs_issue_ckpt,
|
||||
cprc->ckpt_thread_ioprio);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
ui = (unsigned int *)(ptr + a->offset);
|
||||
|
||||
ret = kstrtoul(skip_spaces(buf), 0, &t);
|
||||
@ -323,7 +442,9 @@ out:
|
||||
if (a->struct_type == RESERVED_BLOCKS) {
|
||||
spin_lock(&sbi->stat_lock);
|
||||
if (t > (unsigned long)(sbi->user_block_count -
|
||||
F2FS_OPTION(sbi).root_reserved_blocks)) {
|
||||
F2FS_OPTION(sbi).root_reserved_blocks -
|
||||
sbi->blocks_per_seg *
|
||||
SM_I(sbi)->additional_reserved_segments)) {
|
||||
spin_unlock(&sbi->stat_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -337,6 +458,8 @@ out:
|
||||
if (!strcmp(a->attr.name, "discard_granularity")) {
|
||||
if (t == 0 || t > MAX_PLIST_NUM)
|
||||
return -EINVAL;
|
||||
if (!f2fs_block_unit_discard(sbi))
|
||||
return -EINVAL;
|
||||
if (t == *ui)
|
||||
return count;
|
||||
*ui = t;
|
||||
@ -352,29 +475,55 @@ out:
|
||||
return -EINVAL;
|
||||
|
||||
if (!strcmp(a->attr.name, "gc_urgent")) {
|
||||
if (t >= 1) {
|
||||
sbi->gc_mode = GC_URGENT;
|
||||
if (t == 0) {
|
||||
sbi->gc_mode = GC_NORMAL;
|
||||
} else if (t == 1) {
|
||||
sbi->gc_mode = GC_URGENT_HIGH;
|
||||
if (sbi->gc_thread) {
|
||||
sbi->gc_thread->gc_wake = 1;
|
||||
wake_up_interruptible_all(
|
||||
&sbi->gc_thread->gc_wait_queue_head);
|
||||
wake_up_discard_thread(sbi, true);
|
||||
}
|
||||
} else if (t == 2) {
|
||||
sbi->gc_mode = GC_URGENT_LOW;
|
||||
} else if (t == 3) {
|
||||
sbi->gc_mode = GC_URGENT_MID;
|
||||
if (sbi->gc_thread) {
|
||||
sbi->gc_thread->gc_wake = 1;
|
||||
wake_up_interruptible_all(
|
||||
&sbi->gc_thread->gc_wait_queue_head);
|
||||
}
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
if (!strcmp(a->attr.name, "gc_idle")) {
|
||||
if (t == GC_IDLE_CB) {
|
||||
sbi->gc_mode = GC_IDLE_CB;
|
||||
} else if (t == GC_IDLE_GREEDY) {
|
||||
sbi->gc_mode = GC_IDLE_GREEDY;
|
||||
} else if (t == GC_IDLE_AT) {
|
||||
if (!sbi->am.atgc_enabled)
|
||||
return -EINVAL;
|
||||
sbi->gc_mode = GC_IDLE_AT;
|
||||
} else {
|
||||
sbi->gc_mode = GC_NORMAL;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
if (!strcmp(a->attr.name, "gc_idle")) {
|
||||
if (t == GC_IDLE_CB)
|
||||
sbi->gc_mode = GC_IDLE_CB;
|
||||
else if (t == GC_IDLE_GREEDY)
|
||||
sbi->gc_mode = GC_IDLE_GREEDY;
|
||||
else
|
||||
sbi->gc_mode = GC_NORMAL;
|
||||
|
||||
if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) {
|
||||
spin_lock(&sbi->gc_urgent_high_lock);
|
||||
sbi->gc_urgent_high_limited = t != 0;
|
||||
sbi->gc_urgent_high_remaining = t;
|
||||
spin_unlock(&sbi->gc_urgent_high_lock);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_F2FS_IOSTAT
|
||||
if (!strcmp(a->attr.name, "iostat_enable")) {
|
||||
sbi->iostat_enable = !!t;
|
||||
if (!sbi->iostat_enable)
|
||||
@ -390,6 +539,70 @@ out:
|
||||
spin_unlock(&sbi->iostat_lock);
|
||||
return count;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
if (!strcmp(a->attr.name, "compr_written_block") ||
|
||||
!strcmp(a->attr.name, "compr_saved_block")) {
|
||||
if (t != 0)
|
||||
return -EINVAL;
|
||||
sbi->compr_written_block = 0;
|
||||
sbi->compr_saved_block = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "compr_new_inode")) {
|
||||
if (t != 0)
|
||||
return -EINVAL;
|
||||
sbi->compr_new_inode = 0;
|
||||
return count;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!strcmp(a->attr.name, "atgc_candidate_ratio")) {
|
||||
if (t > 100)
|
||||
return -EINVAL;
|
||||
sbi->am.candidate_ratio = t;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "atgc_age_weight")) {
|
||||
if (t > 100)
|
||||
return -EINVAL;
|
||||
sbi->am.age_weight = t;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "gc_segment_mode")) {
|
||||
if (t < MAX_GC_MODE)
|
||||
sbi->gc_segment_mode = t;
|
||||
else
|
||||
return -EINVAL;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
|
||||
if (t != 0)
|
||||
return -EINVAL;
|
||||
sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "max_fragment_chunk")) {
|
||||
if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
|
||||
sbi->max_fragment_chunk = t;
|
||||
else
|
||||
return -EINVAL;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "max_fragment_hole")) {
|
||||
if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
|
||||
sbi->max_fragment_hole = t;
|
||||
else
|
||||
return -EINVAL;
|
||||
return count;
|
||||
}
|
||||
|
||||
*ui = (unsigned int)t;
|
||||
|
||||
@ -442,46 +655,49 @@ static void f2fs_sb_release(struct kobject *kobj)
|
||||
complete(&sbi->s_kobj_unregister);
|
||||
}
|
||||
|
||||
enum feat_id {
|
||||
FEAT_CRYPTO = 0,
|
||||
FEAT_BLKZONED,
|
||||
FEAT_ATOMIC_WRITE,
|
||||
FEAT_EXTRA_ATTR,
|
||||
FEAT_PROJECT_QUOTA,
|
||||
FEAT_INODE_CHECKSUM,
|
||||
FEAT_FLEXIBLE_INLINE_XATTR,
|
||||
FEAT_QUOTA_INO,
|
||||
FEAT_INODE_CRTIME,
|
||||
FEAT_LOST_FOUND,
|
||||
FEAT_VERITY,
|
||||
FEAT_SB_CHECKSUM,
|
||||
FEAT_CASEFOLD,
|
||||
FEAT_COMPRESSION,
|
||||
FEAT_TEST_DUMMY_ENCRYPTION_V2,
|
||||
};
|
||||
|
||||
/*
|
||||
* Note that there are three feature list entries:
|
||||
* 1) /sys/fs/f2fs/features
|
||||
* : shows runtime features supported by in-kernel f2fs along with Kconfig.
|
||||
* - ref. F2FS_FEATURE_RO_ATTR()
|
||||
*
|
||||
* 2) /sys/fs/f2fs/$s_id/features <deprecated>
|
||||
* : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This
|
||||
* won't add new feature anymore, and thus, users should check entries in 3)
|
||||
* instead of this 2).
|
||||
*
|
||||
* 3) /sys/fs/f2fs/$s_id/feature_list
|
||||
* : shows on-disk features enabled by mkfs.f2fs per instance, which follows
|
||||
* sysfs entry rule where each entry should expose single value.
|
||||
* This list covers old feature list provided by 2) and beyond. Therefore,
|
||||
* please add new on-disk feature in this list only.
|
||||
* - ref. F2FS_SB_FEATURE_RO_ATTR()
|
||||
*/
|
||||
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
switch (a->id) {
|
||||
case FEAT_CRYPTO:
|
||||
case FEAT_BLKZONED:
|
||||
case FEAT_ATOMIC_WRITE:
|
||||
case FEAT_EXTRA_ATTR:
|
||||
case FEAT_PROJECT_QUOTA:
|
||||
case FEAT_INODE_CHECKSUM:
|
||||
case FEAT_FLEXIBLE_INLINE_XATTR:
|
||||
case FEAT_QUOTA_INO:
|
||||
case FEAT_INODE_CRTIME:
|
||||
case FEAT_LOST_FOUND:
|
||||
case FEAT_VERITY:
|
||||
case FEAT_SB_CHECKSUM:
|
||||
case FEAT_CASEFOLD:
|
||||
case FEAT_COMPRESSION:
|
||||
case FEAT_TEST_DUMMY_ENCRYPTION_V2:
|
||||
return sprintf(buf, "supported\n");
|
||||
}
|
||||
|
||||
#define F2FS_FEATURE_RO_ATTR(_name) \
|
||||
static struct f2fs_attr f2fs_attr_##_name = { \
|
||||
.attr = {.name = __stringify(_name), .mode = 0444 }, \
|
||||
.show = f2fs_feature_show, \
|
||||
}
|
||||
|
||||
static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
|
||||
struct f2fs_sb_info *sbi, char *buf)
|
||||
{
|
||||
if (F2FS_HAS_FEATURE(sbi, a->id))
|
||||
return sprintf(buf, "supported\n");
|
||||
}
|
||||
return 0;
|
||||
return sprintf(buf, "unsupported\n");
|
||||
}
|
||||
|
||||
#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \
|
||||
static struct f2fs_attr f2fs_attr_sb_##_name = { \
|
||||
.attr = {.name = __stringify(_name), .mode = 0444 }, \
|
||||
.show = f2fs_sb_feature_show, \
|
||||
.id = F2FS_FEATURE_##_feat, \
|
||||
}
|
||||
|
||||
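Following the note on the three feature entries, each file under /sys/fs/f2fs/<dev>/feature_list/ reports a single on-disk feature as "supported" or "unsupported" via f2fs_sb_feature_show() above. A short user-space sketch of probing one entry; the device name "sda1" is an assumption.

/* Sketch: read one per-filesystem on-disk feature entry. */
#include <stdio.h>

int main(void)
{
	char buf[32] = "";
	FILE *f = fopen("/sys/fs/f2fs/sda1/feature_list/compression", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("compression: %s", buf);	/* "supported" or "unsupported" */
	fclose(f);
	return 0;
}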
#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
|
||||
@ -501,13 +717,6 @@ static struct f2fs_attr f2fs_attr_##_name = { \
|
||||
#define F2FS_GENERAL_RO_ATTR(name) \
|
||||
static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
|
||||
|
||||
#define F2FS_FEATURE_RO_ATTR(_name, _id) \
|
||||
static struct f2fs_attr f2fs_attr_##_name = { \
|
||||
.attr = {.name = __stringify(_name), .mode = 0444 }, \
|
||||
.show = f2fs_feature_show, \
|
||||
.id = _id, \
|
||||
}
|
||||
|
||||
#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \
|
||||
static struct f2fs_attr f2fs_attr_##_name = { \
|
||||
.attr = {.name = __stringify(_name), .mode = 0444 }, \
|
||||
@ -524,8 +733,11 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
|
||||
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
|
||||
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr);
|
||||
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
|
||||
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request);
|
||||
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
|
||||
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
|
||||
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
|
||||
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
|
||||
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
|
||||
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
|
||||
@ -538,6 +750,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
|
||||
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
|
||||
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
|
||||
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
|
||||
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
|
||||
@ -548,9 +761,12 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
|
||||
umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
|
||||
#ifdef CONFIG_F2FS_IOSTAT
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms);
|
||||
#endif
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
|
||||
#ifdef CONFIG_F2FS_FAULT_INJECTION
|
||||
@ -559,14 +775,19 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
|
||||
#endif
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining);
|
||||
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
|
||||
F2FS_GENERAL_RO_ATTR(dirty_segments);
|
||||
F2FS_GENERAL_RO_ATTR(free_segments);
|
||||
F2FS_GENERAL_RO_ATTR(ovp_segments);
|
||||
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
|
||||
F2FS_GENERAL_RO_ATTR(features);
|
||||
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
|
||||
F2FS_GENERAL_RO_ATTR(unusable);
|
||||
F2FS_GENERAL_RO_ATTR(encoding);
|
||||
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
|
||||
F2FS_GENERAL_RO_ATTR(main_blkaddr);
|
||||
F2FS_GENERAL_RO_ATTR(pending_discard);
|
||||
#ifdef CONFIG_F2FS_STAT_FS
|
||||
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
|
||||
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
|
||||
@ -578,28 +799,49 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_FS_ENCRYPTION
|
||||
F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
|
||||
F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2);
|
||||
F2FS_FEATURE_RO_ATTR(encryption);
|
||||
F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2);
|
||||
#ifdef CONFIG_UNICODE
|
||||
F2FS_FEATURE_RO_ATTR(encrypted_casefold);
|
||||
#endif
|
||||
#endif /* CONFIG_FS_ENCRYPTION */
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED);
|
||||
F2FS_FEATURE_RO_ATTR(block_zoned);
|
||||
#endif
|
||||
F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE);
|
||||
F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR);
|
||||
F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA);
|
||||
F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
|
||||
F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
|
||||
F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
|
||||
F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
|
||||
F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
|
||||
F2FS_FEATURE_RO_ATTR(atomic_write);
|
||||
F2FS_FEATURE_RO_ATTR(extra_attr);
|
||||
F2FS_FEATURE_RO_ATTR(project_quota);
|
||||
F2FS_FEATURE_RO_ATTR(inode_checksum);
|
||||
F2FS_FEATURE_RO_ATTR(flexible_inline_xattr);
|
||||
F2FS_FEATURE_RO_ATTR(quota_ino);
|
||||
F2FS_FEATURE_RO_ATTR(inode_crtime);
|
||||
F2FS_FEATURE_RO_ATTR(lost_found);
|
||||
#ifdef CONFIG_FS_VERITY
|
||||
F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
|
||||
F2FS_FEATURE_RO_ATTR(verity);
|
||||
#endif
|
||||
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
|
||||
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
|
||||
F2FS_FEATURE_RO_ATTR(sb_checksum);
|
||||
#ifdef CONFIG_UNICODE
|
||||
F2FS_FEATURE_RO_ATTR(casefold);
|
||||
#endif
|
||||
F2FS_FEATURE_RO_ATTR(readonly);
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
|
||||
F2FS_FEATURE_RO_ATTR(compression);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode);
|
||||
#endif
|
||||
F2FS_FEATURE_RO_ATTR(pin_file);
|
||||
|
||||
/* For ATGC */
|
||||
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio);
|
||||
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count);
|
||||
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight);
|
||||
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold);
|
||||
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk);
|
||||
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole);
|
||||
|
||||
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
|
||||
static struct attribute *f2fs_attrs[] = {
|
||||
@ -612,7 +854,12 @@ static struct attribute *f2fs_attrs[] = {
|
||||
ATTR_LIST(reclaim_segments),
|
||||
ATTR_LIST(main_blkaddr),
|
||||
ATTR_LIST(max_small_discards),
|
||||
ATTR_LIST(max_discard_request),
|
||||
ATTR_LIST(min_discard_issue_time),
|
||||
ATTR_LIST(mid_discard_issue_time),
|
||||
ATTR_LIST(max_discard_issue_time),
|
||||
ATTR_LIST(discard_granularity),
|
||||
ATTR_LIST(pending_discard),
|
||||
ATTR_LIST(batched_trim_sections),
|
||||
ATTR_LIST(ipu_policy),
|
||||
ATTR_LIST(min_ipu_util),
|
||||
@ -626,14 +873,18 @@ static struct attribute *f2fs_attrs[] = {
|
||||
ATTR_LIST(ram_thresh),
|
||||
ATTR_LIST(ra_nid_pages),
|
||||
ATTR_LIST(dirty_nats_ratio),
|
||||
ATTR_LIST(max_roll_forward_node_blocks),
|
||||
ATTR_LIST(cp_interval),
|
||||
ATTR_LIST(idle_interval),
|
||||
ATTR_LIST(discard_idle_interval),
|
||||
ATTR_LIST(gc_idle_interval),
|
||||
ATTR_LIST(umount_discard_timeout),
|
||||
#ifdef CONFIG_F2FS_IOSTAT
|
||||
ATTR_LIST(iostat_enable),
|
||||
ATTR_LIST(iostat_period_ms),
|
||||
#endif
|
||||
ATTR_LIST(readdir_ra),
|
||||
ATTR_LIST(max_io_bytes),
|
||||
ATTR_LIST(gc_pin_file_thresh),
|
||||
ATTR_LIST(extension_list),
|
||||
#ifdef CONFIG_F2FS_FAULT_INJECTION
|
||||
@ -642,8 +893,11 @@ static struct attribute *f2fs_attrs[] = {
|
||||
#endif
|
||||
ATTR_LIST(data_io_flag),
|
||||
ATTR_LIST(node_io_flag),
|
||||
ATTR_LIST(gc_urgent_high_remaining),
|
||||
ATTR_LIST(ckpt_thread_ioprio),
|
||||
ATTR_LIST(dirty_segments),
|
||||
ATTR_LIST(free_segments),
|
||||
ATTR_LIST(ovp_segments),
|
||||
ATTR_LIST(unusable),
|
||||
ATTR_LIST(lifetime_write_kbytes),
|
||||
ATTR_LIST(features),
|
||||
@ -660,6 +914,20 @@ static struct attribute *f2fs_attrs[] = {
|
||||
ATTR_LIST(moved_blocks_background),
|
||||
ATTR_LIST(avg_vblocks),
|
||||
#endif
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
ATTR_LIST(compr_written_block),
|
||||
ATTR_LIST(compr_saved_block),
|
||||
ATTR_LIST(compr_new_inode),
|
||||
#endif
|
||||
/* For ATGC */
|
||||
ATTR_LIST(atgc_candidate_ratio),
|
||||
ATTR_LIST(atgc_candidate_count),
|
||||
ATTR_LIST(atgc_age_weight),
|
||||
ATTR_LIST(atgc_age_threshold),
|
||||
ATTR_LIST(gc_segment_mode),
|
||||
ATTR_LIST(gc_reclaimed_segments),
|
||||
ATTR_LIST(max_fragment_chunk),
|
||||
ATTR_LIST(max_fragment_hole),
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -667,7 +935,10 @@ static struct attribute *f2fs_feat_attrs[] = {
|
||||
#ifdef CONFIG_FS_ENCRYPTION
|
||||
ATTR_LIST(encryption),
|
||||
ATTR_LIST(test_dummy_encryption_v2),
|
||||
#ifdef CONFIG_UNICODE
|
||||
ATTR_LIST(encrypted_casefold),
|
||||
#endif
|
||||
#endif /* CONFIG_FS_ENCRYPTION */
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
ATTR_LIST(block_zoned),
|
||||
#endif
|
||||
@ -683,10 +954,53 @@ static struct attribute *f2fs_feat_attrs[] = {
|
||||
ATTR_LIST(verity),
|
||||
#endif
|
||||
ATTR_LIST(sb_checksum),
|
||||
#ifdef CONFIG_UNICODE
|
||||
ATTR_LIST(casefold),
|
||||
#endif
|
||||
ATTR_LIST(readonly),
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
ATTR_LIST(compression),
|
||||
#endif
|
||||
ATTR_LIST(pin_file),
|
||||
NULL,
|
||||
};
|
||||
|
||||
F2FS_GENERAL_RO_ATTR(sb_status);
|
||||
static struct attribute *f2fs_stat_attrs[] = {
|
||||
ATTR_LIST(sb_status),
|
||||
NULL,
|
||||
};
|
||||
|
||||
F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT);
|
||||
F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED);
|
||||
F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR);
|
||||
F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA);
|
||||
F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM);
|
||||
F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
|
||||
F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO);
|
||||
F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME);
|
||||
F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND);
|
||||
F2FS_SB_FEATURE_RO_ATTR(verity, VERITY);
|
||||
F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM);
|
||||
F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
|
||||
F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
|
||||
F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
|
||||
|
||||
static struct attribute *f2fs_sb_feat_attrs[] = {
|
||||
ATTR_LIST(sb_encryption),
|
||||
ATTR_LIST(sb_block_zoned),
|
||||
ATTR_LIST(sb_extra_attr),
|
||||
ATTR_LIST(sb_project_quota),
|
||||
ATTR_LIST(sb_inode_checksum),
|
||||
ATTR_LIST(sb_flexible_inline_xattr),
|
||||
ATTR_LIST(sb_quota_ino),
|
||||
ATTR_LIST(sb_inode_crtime),
|
||||
ATTR_LIST(sb_lost_found),
|
||||
ATTR_LIST(sb_verity),
|
||||
ATTR_LIST(sb_sb_checksum),
|
||||
ATTR_LIST(sb_casefold),
|
||||
ATTR_LIST(sb_compression),
|
||||
ATTR_LIST(sb_readonly),
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -706,7 +1020,7 @@ static struct kobj_type f2fs_ktype = {
|
||||
};
|
||||
|
||||
static struct kset f2fs_kset = {
|
||||
.kobj = {.ktype = &f2fs_ktype},
|
||||
.kobj = {.ktype = &f2fs_ktype},
|
||||
};
|
||||
|
||||
static struct kobj_type f2fs_feat_ktype = {
|
||||
@ -718,6 +1032,71 @@ static struct kobject f2fs_feat = {
|
||||
.kset = &f2fs_kset,
|
||||
};
|
||||
|
||||
static ssize_t f2fs_stat_attr_show(struct kobject *kobj,
|
||||
struct attribute *attr, char *buf)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
|
||||
s_stat_kobj);
|
||||
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
|
||||
|
||||
return a->show ? a->show(a, sbi, buf) : 0;
|
||||
}
|
||||
|
||||
static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr,
|
||||
const char *buf, size_t len)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
|
||||
s_stat_kobj);
|
||||
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
|
||||
|
||||
return a->store ? a->store(a, sbi, buf, len) : 0;
|
||||
}
|
||||
|
||||
static void f2fs_stat_kobj_release(struct kobject *kobj)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
|
||||
s_stat_kobj);
|
||||
complete(&sbi->s_stat_kobj_unregister);
|
||||
}
|
||||
|
||||
static const struct sysfs_ops f2fs_stat_attr_ops = {
|
||||
.show = f2fs_stat_attr_show,
|
||||
.store = f2fs_stat_attr_store,
|
||||
};
|
||||
|
||||
static struct kobj_type f2fs_stat_ktype = {
|
||||
.default_attrs = f2fs_stat_attrs,
|
||||
.sysfs_ops = &f2fs_stat_attr_ops,
|
||||
.release = f2fs_stat_kobj_release,
|
||||
};
|
||||
|
||||
static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj,
|
||||
struct attribute *attr, char *buf)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
|
||||
s_feature_list_kobj);
|
||||
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
|
||||
|
||||
return a->show ? a->show(a, sbi, buf) : 0;
|
||||
}
|
||||
|
||||
static void f2fs_feature_list_kobj_release(struct kobject *kobj)
|
||||
{
|
||||
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
|
||||
s_feature_list_kobj);
|
||||
complete(&sbi->s_feature_list_kobj_unregister);
|
||||
}
|
||||
|
||||
static const struct sysfs_ops f2fs_feature_list_attr_ops = {
|
||||
.show = f2fs_sb_feat_attr_show,
|
||||
};
|
||||
|
||||
static struct kobj_type f2fs_feature_list_ktype = {
|
||||
.default_attrs = f2fs_sb_feat_attrs,
|
||||
.sysfs_ops = &f2fs_feature_list_attr_ops,
|
||||
.release = f2fs_feature_list_kobj_release,
|
||||
};
|
||||
|
||||
static int __maybe_unused segment_info_seq_show(struct seq_file *seq,
|
||||
void *offset)
|
||||
{
|
||||
@ -769,101 +1148,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq,
|
||||
return 0;
|
||||
}
|
||||
|
||||
void f2fs_record_iostat(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
unsigned long long iostat_diff[NR_IO_TYPE];
|
||||
int i;
|
||||
|
||||
if (time_is_after_jiffies(sbi->iostat_next_period))
|
||||
return;
|
||||
|
||||
/* Need double check under the lock */
|
||||
spin_lock(&sbi->iostat_lock);
|
||||
if (time_is_after_jiffies(sbi->iostat_next_period)) {
|
||||
spin_unlock(&sbi->iostat_lock);
|
||||
return;
|
||||
}
|
||||
sbi->iostat_next_period = jiffies +
|
||||
msecs_to_jiffies(sbi->iostat_period_ms);
|
||||
|
||||
for (i = 0; i < NR_IO_TYPE; i++) {
|
||||
iostat_diff[i] = sbi->rw_iostat[i] -
|
||||
sbi->prev_rw_iostat[i];
|
||||
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
|
||||
}
|
||||
spin_unlock(&sbi->iostat_lock);
|
||||
|
||||
trace_f2fs_iostat(sbi, iostat_diff);
|
||||
}
|
||||
|
||||
static int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
|
||||
void *offset)
|
||||
{
|
||||
struct super_block *sb = seq->private;
|
||||
struct f2fs_sb_info *sbi = F2FS_SB(sb);
|
||||
time64_t now = ktime_get_real_seconds();
|
||||
|
||||
if (!sbi->iostat_enable)
|
||||
return 0;
|
||||
|
||||
seq_printf(seq, "time: %-16llu\n", now);
|
||||
|
||||
/* print app write IOs */
|
||||
seq_puts(seq, "[WRITE]\n");
|
||||
seq_printf(seq, "app buffered: %-16llu\n",
|
||||
sbi->rw_iostat[APP_BUFFERED_IO]);
|
||||
seq_printf(seq, "app direct: %-16llu\n",
|
||||
sbi->rw_iostat[APP_DIRECT_IO]);
|
||||
seq_printf(seq, "app mapped: %-16llu\n",
|
||||
sbi->rw_iostat[APP_MAPPED_IO]);
|
||||
|
||||
/* print fs write IOs */
|
||||
seq_printf(seq, "fs data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_DATA_IO]);
|
||||
seq_printf(seq, "fs node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_NODE_IO]);
|
||||
seq_printf(seq, "fs meta: %-16llu\n",
|
||||
sbi->rw_iostat[FS_META_IO]);
|
||||
seq_printf(seq, "fs gc data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_GC_DATA_IO]);
|
||||
seq_printf(seq, "fs gc node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_GC_NODE_IO]);
|
||||
seq_printf(seq, "fs cp data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CP_DATA_IO]);
|
||||
seq_printf(seq, "fs cp node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CP_NODE_IO]);
|
||||
seq_printf(seq, "fs cp meta: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CP_META_IO]);
|
||||
|
||||
/* print app read IOs */
|
||||
seq_puts(seq, "[READ]\n");
|
||||
seq_printf(seq, "app buffered: %-16llu\n",
|
||||
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
|
||||
seq_printf(seq, "app direct: %-16llu\n",
|
||||
sbi->rw_iostat[APP_DIRECT_READ_IO]);
|
||||
seq_printf(seq, "app mapped: %-16llu\n",
|
||||
sbi->rw_iostat[APP_MAPPED_READ_IO]);
|
||||
|
||||
/* print fs read IOs */
|
||||
seq_printf(seq, "fs data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_DATA_READ_IO]);
|
||||
seq_printf(seq, "fs gc data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_GDATA_READ_IO]);
|
||||
seq_printf(seq, "fs compr_data: %-16llu\n",
|
||||
sbi->rw_iostat[FS_CDATA_READ_IO]);
|
||||
seq_printf(seq, "fs node: %-16llu\n",
|
||||
sbi->rw_iostat[FS_NODE_READ_IO]);
|
||||
seq_printf(seq, "fs meta: %-16llu\n",
|
||||
sbi->rw_iostat[FS_META_READ_IO]);
|
||||
|
||||
/* print other IOs */
|
||||
seq_puts(seq, "[OTHER]\n");
|
||||
seq_printf(seq, "fs discard: %-16llu\n",
|
||||
sbi->rw_iostat[FS_DISCARD]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
						void *offset)
{
@@ -901,7 +1185,9 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \

F2FS_PROC_FILE_DEF(segment_info);
F2FS_PROC_FILE_DEF(segment_bits);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_PROC_FILE_DEF(iostat_info);
#endif
F2FS_PROC_FILE_DEF(victim_bits);

int __init f2fs_init_sysfs(void)
|
||||
@ -942,37 +1228,71 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
|
||||
init_completion(&sbi->s_kobj_unregister);
|
||||
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
|
||||
"%s", sb->s_id);
|
||||
if (err) {
|
||||
kobject_put(&sbi->s_kobj);
|
||||
wait_for_completion(&sbi->s_kobj_unregister);
|
||||
return err;
|
||||
}
|
||||
if (err)
|
||||
goto put_sb_kobj;
|
||||
|
||||
sbi->s_stat_kobj.kset = &f2fs_kset;
|
||||
init_completion(&sbi->s_stat_kobj_unregister);
|
||||
err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype,
|
||||
&sbi->s_kobj, "stat");
|
||||
if (err)
|
||||
goto put_stat_kobj;
|
||||
|
||||
sbi->s_feature_list_kobj.kset = &f2fs_kset;
|
||||
init_completion(&sbi->s_feature_list_kobj_unregister);
|
||||
err = kobject_init_and_add(&sbi->s_feature_list_kobj,
|
||||
&f2fs_feature_list_ktype,
|
||||
&sbi->s_kobj, "feature_list");
|
||||
if (err)
|
||||
goto put_feature_list_kobj;
|
||||
|
||||
if (f2fs_proc_root)
|
||||
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
|
||||
|
||||
if (sbi->s_proc) {
|
||||
proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
|
||||
&f2fs_seq_segment_info_fops, sb);
|
||||
proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
|
||||
&f2fs_seq_segment_bits_fops, sb);
|
||||
proc_create_data("iostat_info", S_IRUGO, sbi->s_proc,
|
||||
proc_create_data("segment_info", 0444, sbi->s_proc,
|
||||
&f2fs_seq_segment_info_fops, sb);
|
||||
proc_create_data("segment_bits", 0444, sbi->s_proc,
|
||||
&f2fs_seq_segment_bits_fops, sb);
|
||||
#ifdef CONFIG_F2FS_IOSTAT
|
||||
proc_create_data("iostat_info", 0444, sbi->s_proc,
|
||||
&f2fs_seq_iostat_info_fops, sb);
|
||||
proc_create_data("victim_bits", S_IRUGO, sbi->s_proc,
|
||||
#endif
|
||||
proc_create_data("victim_bits", 0444, sbi->s_proc,
|
||||
&f2fs_seq_victim_bits_fops, sb);
|
||||
}
|
||||
return 0;
|
||||
put_feature_list_kobj:
|
||||
kobject_put(&sbi->s_feature_list_kobj);
|
||||
wait_for_completion(&sbi->s_feature_list_kobj_unregister);
|
||||
put_stat_kobj:
|
||||
kobject_put(&sbi->s_stat_kobj);
|
||||
wait_for_completion(&sbi->s_stat_kobj_unregister);
|
||||
put_sb_kobj:
|
||||
kobject_put(&sbi->s_kobj);
|
||||
wait_for_completion(&sbi->s_kobj_unregister);
|
||||
return err;
|
||||
}
|
||||
|
||||
void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
|
||||
{
|
||||
if (sbi->s_proc) {
|
||||
#ifdef CONFIG_F2FS_IOSTAT
|
||||
remove_proc_entry("iostat_info", sbi->s_proc);
|
||||
#endif
|
||||
remove_proc_entry("segment_info", sbi->s_proc);
|
||||
remove_proc_entry("segment_bits", sbi->s_proc);
|
||||
remove_proc_entry("victim_bits", sbi->s_proc);
|
||||
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
|
||||
}
|
||||
|
||||
kobject_del(&sbi->s_stat_kobj);
|
||||
kobject_put(&sbi->s_stat_kobj);
|
||||
wait_for_completion(&sbi->s_stat_kobj_unregister);
|
||||
kobject_del(&sbi->s_feature_list_kobj);
|
||||
kobject_put(&sbi->s_feature_list_kobj);
|
||||
wait_for_completion(&sbi->s_feature_list_kobj_unregister);
|
||||
|
||||
kobject_del(&sbi->s_kobj);
|
||||
kobject_put(&sbi->s_kobj);
|
||||
wait_for_completion(&sbi->s_kobj_unregister);
|
||||
|
165
fs/f2fs/trace.c
@ -1,165 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* f2fs IO tracer
|
||||
*
|
||||
* Copyright (c) 2014 Motorola Mobility
|
||||
* Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
|
||||
*/
|
||||
#include <linux/fs.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/radix-tree.h>
|
||||
|
||||
#include "f2fs.h"
|
||||
#include "trace.h"
|
||||
|
||||
static RADIX_TREE(pids, GFP_ATOMIC);
|
||||
static spinlock_t pids_lock;
|
||||
static struct last_io_info last_io;
|
||||
|
||||
static inline void __print_last_io(void)
|
||||
{
|
||||
if (!last_io.len)
|
||||
return;
|
||||
|
||||
trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n",
|
||||
last_io.major, last_io.minor,
|
||||
last_io.pid, "----------------",
|
||||
last_io.type,
|
||||
last_io.fio.op, last_io.fio.op_flags,
|
||||
last_io.fio.new_blkaddr,
|
||||
last_io.len);
|
||||
memset(&last_io, 0, sizeof(last_io));
|
||||
}
|
||||
|
||||
static int __file_type(struct inode *inode, pid_t pid)
|
||||
{
|
||||
if (f2fs_is_atomic_file(inode))
|
||||
return __ATOMIC_FILE;
|
||||
else if (f2fs_is_volatile_file(inode))
|
||||
return __VOLATILE_FILE;
|
||||
else if (S_ISDIR(inode->i_mode))
|
||||
return __DIR_FILE;
|
||||
else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
|
||||
return __NODE_FILE;
|
||||
else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
|
||||
return __META_FILE;
|
||||
else if (pid)
|
||||
return __NORMAL_FILE;
|
||||
else
|
||||
return __MISC_FILE;
|
||||
}
|
||||
|
||||
void f2fs_trace_pid(struct page *page)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
pid_t pid = task_pid_nr(current);
|
||||
void *p;
|
||||
|
||||
set_page_private(page, (unsigned long)pid);
|
||||
|
||||
retry:
|
||||
if (radix_tree_preload(GFP_NOFS))
|
||||
return;
|
||||
|
||||
spin_lock(&pids_lock);
|
||||
p = radix_tree_lookup(&pids, pid);
|
||||
if (p == current)
|
||||
goto out;
|
||||
if (p)
|
||||
radix_tree_delete(&pids, pid);
|
||||
|
||||
if (radix_tree_insert(&pids, pid, current)) {
|
||||
spin_unlock(&pids_lock);
|
||||
radix_tree_preload_end();
|
||||
cond_resched();
|
||||
goto retry;
|
||||
}
|
||||
|
||||
trace_printk("%3x:%3x %4x %-16s\n",
|
||||
MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
|
||||
pid, current->comm);
|
||||
out:
|
||||
spin_unlock(&pids_lock);
|
||||
radix_tree_preload_end();
|
||||
}
|
||||
|
||||
void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
|
||||
{
|
||||
struct inode *inode;
|
||||
pid_t pid;
|
||||
int major, minor;
|
||||
|
||||
if (flush) {
|
||||
__print_last_io();
|
||||
return;
|
||||
}
|
||||
|
||||
inode = fio->page->mapping->host;
|
||||
pid = page_private(fio->page);
|
||||
|
||||
major = MAJOR(inode->i_sb->s_dev);
|
||||
minor = MINOR(inode->i_sb->s_dev);
|
||||
|
||||
if (last_io.major == major && last_io.minor == minor &&
|
||||
last_io.pid == pid &&
|
||||
last_io.type == __file_type(inode, pid) &&
|
||||
last_io.fio.op == fio->op &&
|
||||
last_io.fio.op_flags == fio->op_flags &&
|
||||
last_io.fio.new_blkaddr + last_io.len ==
|
||||
fio->new_blkaddr) {
|
||||
last_io.len++;
|
||||
return;
|
||||
}
|
||||
|
||||
__print_last_io();
|
||||
|
||||
last_io.major = major;
|
||||
last_io.minor = minor;
|
||||
last_io.pid = pid;
|
||||
last_io.type = __file_type(inode, pid);
|
||||
last_io.fio = *fio;
|
||||
last_io.len = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
void f2fs_build_trace_ios(void)
|
||||
{
|
||||
spin_lock_init(&pids_lock);
|
||||
}
|
||||
|
||||
#define PIDVEC_SIZE 128
|
||||
static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
|
||||
unsigned int max_items)
|
||||
{
|
||||
struct radix_tree_iter iter;
|
||||
void **slot;
|
||||
unsigned int ret = 0;
|
||||
|
||||
if (unlikely(!max_items))
|
||||
return 0;
|
||||
|
||||
radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
|
||||
results[ret] = iter.index;
|
||||
if (++ret == max_items)
|
||||
break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void f2fs_destroy_trace_ios(void)
|
||||
{
|
||||
pid_t pid[PIDVEC_SIZE];
|
||||
pid_t next_pid = 0;
|
||||
unsigned int found;
|
||||
|
||||
spin_lock(&pids_lock);
|
||||
while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
|
||||
unsigned idx;
|
||||
|
||||
next_pid = pid[found - 1] + 1;
|
||||
for (idx = 0; idx < found; idx++)
|
||||
radix_tree_delete(&pids, pid[idx]);
|
||||
}
|
||||
spin_unlock(&pids_lock);
|
||||
}
|
@ -1,43 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* f2fs IO tracer
|
||||
*
|
||||
* Copyright (c) 2014 Motorola Mobility
|
||||
* Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
|
||||
*/
|
||||
#ifndef __F2FS_TRACE_H__
|
||||
#define __F2FS_TRACE_H__
|
||||
|
||||
#ifdef CONFIG_F2FS_IO_TRACE
|
||||
#include <trace/events/f2fs.h>
|
||||
|
||||
enum file_type {
|
||||
__NORMAL_FILE,
|
||||
__DIR_FILE,
|
||||
__NODE_FILE,
|
||||
__META_FILE,
|
||||
__ATOMIC_FILE,
|
||||
__VOLATILE_FILE,
|
||||
__MISC_FILE,
|
||||
};
|
||||
|
||||
struct last_io_info {
|
||||
int major, minor;
|
||||
pid_t pid;
|
||||
enum file_type type;
|
||||
struct f2fs_io_info fio;
|
||||
block_t len;
|
||||
};
|
||||
|
||||
extern void f2fs_trace_pid(struct page *);
|
||||
extern void f2fs_trace_ios(struct f2fs_io_info *, int);
|
||||
extern void f2fs_build_trace_ios(void);
|
||||
extern void f2fs_destroy_trace_ios(void);
|
||||
#else
|
||||
#define f2fs_trace_pid(p)
|
||||
#define f2fs_trace_ios(i, n)
|
||||
#define f2fs_build_trace_ios()
|
||||
#define f2fs_destroy_trace_ios()
|
||||
|
||||
#endif
|
||||
#endif /* __F2FS_TRACE_H__ */
|
@ -29,6 +29,8 @@
|
||||
#include "f2fs.h"
|
||||
#include "xattr.h"
|
||||
|
||||
#define F2FS_VERIFY_VER (1)
|
||||
|
||||
static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode)
|
||||
{
|
||||
return round_up(inode->i_size, 65536);
|
||||
@ -134,7 +136,7 @@ static int f2fs_begin_enable_verity(struct file *filp)
|
||||
* here and not rely on ->open() doing it. This must be done before
|
||||
* evicting the inline data.
|
||||
*/
|
||||
err = dquot_initialize(inode);
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -150,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc,
|
||||
size_t desc_size, u64 merkle_tree_size)
|
||||
{
|
||||
struct inode *inode = file_inode(filp);
|
||||
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
||||
u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
|
||||
struct fsverity_descriptor_location dloc = {
|
||||
.version = cpu_to_le32(1),
|
||||
.version = cpu_to_le32(F2FS_VERIFY_VER),
|
||||
.size = cpu_to_le32(desc_size),
|
||||
.pos = cpu_to_le64(desc_pos),
|
||||
};
|
||||
int err = 0;
|
||||
int err = 0, err2 = 0;
|
||||
|
||||
if (desc != NULL) {
|
||||
/* Succeeded; write the verity descriptor. */
|
||||
err = pagecache_write(inode, desc, desc_size, desc_pos);
|
||||
/*
|
||||
* If an error already occurred (which fs/verity/ signals by passing
|
||||
* desc == NULL), then only clean-up is needed.
|
||||
*/
|
||||
if (desc == NULL)
|
||||
goto cleanup;
|
||||
|
||||
/* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
|
||||
if (!err)
|
||||
err = filemap_write_and_wait(inode->i_mapping);
|
||||
}
|
||||
/* Append the verity descriptor. */
|
||||
err = pagecache_write(inode, desc, desc_size, desc_pos);
|
||||
if (err)
|
||||
goto cleanup;
|
||||
|
||||
/* If we failed, truncate anything we wrote past i_size. */
|
||||
if (desc == NULL || err)
|
||||
f2fs_truncate(inode);
|
||||
/*
|
||||
* Write all pages (both data and verity metadata). Note that this must
|
||||
* happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond
|
||||
* i_size won't be written properly. For crash consistency, this also
|
||||
* must happen before the verity inode flag gets persisted.
|
||||
*/
|
||||
err = filemap_write_and_wait(inode->i_mapping);
|
||||
if (err)
|
||||
goto cleanup;
|
||||
|
||||
/* Set the verity xattr. */
|
||||
err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
|
||||
F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
|
||||
NULL, XATTR_CREATE);
|
||||
if (err)
|
||||
goto cleanup;
|
||||
|
||||
/* Finally, set the verity inode flag. */
|
||||
file_set_verity(inode);
|
||||
f2fs_set_inode_flags(inode);
|
||||
f2fs_mark_inode_dirty_sync(inode, true);
|
||||
|
||||
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
|
||||
return 0;
|
||||
|
||||
if (desc != NULL && !err) {
|
||||
err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
|
||||
F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
|
||||
NULL, XATTR_CREATE);
|
||||
if (!err) {
|
||||
file_set_verity(inode);
|
||||
f2fs_set_inode_flags(inode);
|
||||
f2fs_mark_inode_dirty_sync(inode, true);
|
||||
}
|
||||
cleanup:
|
||||
/*
|
||||
* Verity failed to be enabled, so clean up by truncating any verity
|
||||
* metadata that was written beyond i_size (both from cache and from
|
||||
* disk) and clearing FI_VERITY_IN_PROGRESS.
|
||||
*
|
||||
* Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection
|
||||
* from re-instantiating cached pages we are truncating (since unlike
|
||||
* normal file accesses, garbage collection isn't limited by i_size).
|
||||
*/
|
||||
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
|
||||
truncate_inode_pages(inode->i_mapping, inode->i_size);
|
||||
err2 = f2fs_truncate(inode);
|
||||
if (err2) {
|
||||
f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)",
|
||||
err2);
|
||||
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
||||
}
|
||||
return err;
|
||||
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
|
||||
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
|
||||
return err ?: err2;
|
||||
}
|
||||
|
||||
static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
|
||||
@ -199,7 +234,7 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
|
||||
F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL);
|
||||
if (res < 0 && res != -ERANGE)
|
||||
return res;
|
||||
if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) {
|
||||
if (res != sizeof(dloc) || dloc.version != cpu_to_le32(F2FS_VERIFY_VER)) {
|
||||
f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
@ -27,7 +27,8 @@ static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
|
||||
{
|
||||
if (likely(size == sbi->inline_xattr_slab_size)) {
|
||||
*is_inline = true;
|
||||
return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
|
||||
return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab,
|
||||
GFP_F2FS_ZERO, false, sbi);
|
||||
}
|
||||
*is_inline = false;
|
||||
return f2fs_kzalloc(sbi, size, GFP_NOFS);
|
||||
@ -39,7 +40,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
|
||||
if (is_inline)
|
||||
kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
|
||||
else
|
||||
kvfree(xattr_addr);
|
||||
kfree(xattr_addr);
|
||||
}
|
||||
|
||||
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
|
||||
@ -175,8 +176,8 @@ const struct xattr_handler f2fs_xattr_trusted_handler = {
|
||||
const struct xattr_handler f2fs_xattr_advise_handler = {
|
||||
.name = F2FS_SYSTEM_ADVISE_NAME,
|
||||
.flags = F2FS_XATTR_INDEX_ADVISE,
|
||||
.get = f2fs_xattr_advise_get,
|
||||
.set = f2fs_xattr_advise_set,
|
||||
.get = f2fs_xattr_advise_get,
|
||||
.set = f2fs_xattr_advise_set,
|
||||
};
|
||||
|
||||
const struct xattr_handler f2fs_xattr_security_handler = {
|
||||
@ -223,15 +224,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index)
|
||||
}
|
||||
|
||||
static struct f2fs_xattr_entry *__find_xattr(void *base_addr,
|
||||
void *last_base_addr, int index,
|
||||
size_t len, const char *name)
|
||||
void *last_base_addr, void **last_addr,
|
||||
int index, size_t len, const char *name)
|
||||
{
|
||||
struct f2fs_xattr_entry *entry;
|
||||
|
||||
list_for_each_xattr(entry, base_addr) {
|
||||
if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
|
||||
(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr)
|
||||
(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
|
||||
if (last_addr)
|
||||
*last_addr = entry;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (entry->e_name_index != index)
|
||||
continue;
|
||||
@ -251,19 +255,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
|
||||
unsigned int inline_size = inline_xattr_size(inode);
|
||||
void *max_addr = base_addr + inline_size;
|
||||
|
||||
list_for_each_xattr(entry, base_addr) {
|
||||
if ((void *)entry + sizeof(__u32) > max_addr ||
|
||||
(void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
|
||||
*last_addr = entry;
|
||||
return NULL;
|
||||
}
|
||||
if (entry->e_name_index != index)
|
||||
continue;
|
||||
if (entry->e_name_len != len)
|
||||
continue;
|
||||
if (!memcmp(entry->e_name, name, len))
|
||||
break;
|
||||
}
|
||||
entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name);
|
||||
if (!entry)
|
||||
return NULL;
|
||||
|
||||
/* inline xattr header or entry across max inline xattr size */
|
||||
if (IS_XATTR_LAST_ENTRY(entry) &&
|
||||
@ -327,7 +321,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
|
||||
void *last_addr = NULL;
|
||||
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
|
||||
unsigned int inline_size = inline_xattr_size(inode);
|
||||
int err = 0;
|
||||
int err;
|
||||
|
||||
if (!xnid && !inline_size)
|
||||
return -ENODATA;
|
||||
@ -365,7 +359,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
|
||||
else
|
||||
cur_addr = txattr_addr;
|
||||
|
||||
*xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name);
|
||||
*xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name);
|
||||
if (!*xe) {
|
||||
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
|
||||
inode->i_ino);
|
||||
@ -425,7 +419,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
|
||||
*base_addr = txattr_addr;
|
||||
return 0;
|
||||
fail:
|
||||
kvfree(txattr_addr);
|
||||
kfree(txattr_addr);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -486,6 +480,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
|
||||
f2fs_wait_on_page_writeback(xpage, NODE, true, true);
|
||||
} else {
|
||||
struct dnode_of_data dn;
|
||||
|
||||
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
|
||||
xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
|
||||
if (IS_ERR(xpage)) {
|
||||
@ -515,7 +510,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
|
||||
void *buffer, size_t buffer_size, struct page *ipage)
|
||||
{
|
||||
struct f2fs_xattr_entry *entry = NULL;
|
||||
int error = 0;
|
||||
int error;
|
||||
unsigned int size, len;
|
||||
void *base_addr = NULL;
|
||||
int base_size;
|
||||
@ -528,10 +523,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
|
||||
if (len > F2FS_NAME_LEN)
|
||||
return -ERANGE;
|
||||
|
||||
down_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
error = lookup_all_xattrs(inode, ipage, index, len, name,
|
||||
&entry, &base_addr, &base_size, &is_inline);
|
||||
up_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
@ -562,12 +557,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
|
||||
struct inode *inode = d_inode(dentry);
|
||||
struct f2fs_xattr_entry *entry;
|
||||
void *base_addr, *last_base_addr;
|
||||
int error = 0;
|
||||
int error;
|
||||
size_t rest = buffer_size;
|
||||
|
||||
down_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
error = read_all_xattrs(inode, NULL, &base_addr);
|
||||
up_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
@ -610,7 +605,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
|
||||
}
|
||||
error = buffer_size - rest;
|
||||
cleanup:
|
||||
kvfree(base_addr);
|
||||
kfree(base_addr);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -632,7 +627,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
|
||||
int found, newsize;
|
||||
size_t len;
|
||||
__u32 new_hsize;
|
||||
int error = 0;
|
||||
int error;
|
||||
|
||||
if (name == NULL)
|
||||
return -EINVAL;
|
||||
@ -655,7 +650,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
|
||||
last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
|
||||
|
||||
/* find entry with wanted name. */
|
||||
here = __find_xattr(base_addr, last_base_addr, index, len, name);
|
||||
here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
|
||||
if (!here) {
|
||||
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
|
||||
inode->i_ino);
|
||||
@ -673,7 +668,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
|
||||
}
|
||||
|
||||
if (value && f2fs_xattr_value_same(here, value, size))
|
||||
goto exit;
|
||||
goto same;
|
||||
} else if ((flags & XATTR_REPLACE)) {
|
||||
error = -ENODATA;
|
||||
goto exit;
|
||||
@ -683,6 +678,8 @@ static int __f2fs_setxattr(struct inode *inode, int index,
|
||||
while (!IS_XATTR_LAST_ENTRY(last)) {
|
||||
if ((void *)(last) + sizeof(__u32) > last_base_addr ||
|
||||
(void *)XATTR_NEXT_ENTRY(last) > last_base_addr) {
|
||||
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu",
|
||||
inode->i_ino, ENTRY_SIZE(last));
|
||||
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
|
||||
error = -EFSCORRUPTED;
|
||||
goto exit;
|
||||
@ -745,19 +742,22 @@ static int __f2fs_setxattr(struct inode *inode, int index,
|
||||
if (error)
|
||||
goto exit;
|
||||
|
||||
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
|
||||
inode->i_mode = F2FS_I(inode)->i_acl_mode;
|
||||
inode->i_ctime = current_time(inode);
|
||||
clear_inode_flag(inode, FI_ACL_MODE);
|
||||
}
|
||||
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
|
||||
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
|
||||
f2fs_set_encrypted_inode(inode);
|
||||
f2fs_mark_inode_dirty_sync(inode, true);
|
||||
if (!error && S_ISDIR(inode->i_mode))
|
||||
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
|
||||
|
||||
same:
|
||||
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
|
||||
inode->i_mode = F2FS_I(inode)->i_acl_mode;
|
||||
inode->i_ctime = current_time(inode);
|
||||
clear_inode_flag(inode, FI_ACL_MODE);
|
||||
}
|
||||
|
||||
exit:
|
||||
kvfree(base_addr);
|
||||
kfree(base_addr);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -773,7 +773,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
|
||||
if (!f2fs_is_checkpoint_ready(sbi))
|
||||
return -ENOSPC;
|
||||
|
||||
err = dquot_initialize(inode);
|
||||
err = f2fs_dquot_initialize(inode);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -784,9 +784,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
|
||||
f2fs_balance_fs(sbi, true);
|
||||
|
||||
f2fs_lock_op(sbi);
|
||||
down_write(&F2FS_I(inode)->i_xattr_sem);
|
||||
f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
|
||||
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
|
||||
up_write(&F2FS_I(inode)->i_xattr_sem);
|
||||
f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
|
||||
f2fs_unlock_op(sbi);
|
||||
|
||||
f2fs_update_time(sbi, REQ_TIME);
|
||||
|
@@ -304,6 +304,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode,
	file->f_inode = path->dentry->d_inode;
	file->f_mapping = path->dentry->d_inode->i_mapping;
	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
	file->f_sb_err = file_sample_sb_err(file);
	if ((mode & FMODE_READ) &&
	     likely(fop->read || fop->read_iter))
		mode |= FMODE_CAN_READ;
|
||||
|
30
fs/inode.c
@@ -1554,25 +1554,31 @@ retry:
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 * bmap - find a block number in a file
 * @inode: inode of file
 * @block: block to find
 * @inode: inode owning the block number being requested
 * @block: pointer containing the block to find
 *
 * Returns the block number on the device holding the inode that
 * is the disk block number for the block of the file requested.
 * That is, asked for block 4 of inode 1 the function will return the
 * disk block relative to the disk start that holds that block of the
 * file.
 * Replaces the value in *block with the block number on the device holding
 * corresponding to the requested block number in the file.
 * That is, asked for block 4 of inode 1 the function will replace the
 * 4 in *block, with disk block relative to the disk start that holds that
 * block of the file.
 *
 * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a
 * hole, returns 0 and *block is also set to 0.
 */
sector_t bmap(struct inode *inode, sector_t block)
int bmap(struct inode *inode, sector_t *block)
{
	sector_t res = 0;
	if (inode->i_mapping->a_ops->bmap)
		res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
	return res;
	if (!inode->i_mapping->a_ops->bmap)
		return -EINVAL;

	*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
	return 0;
}
EXPORT_SYMBOL(bmap);
#endif
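For context, a minimal caller sketch of the new calling convention; the helper name example_map_block is hypothetical and not part of this change. With the block number passed by reference and the return value carrying the error, "no ->bmap" and "hole" become distinguishable, which the jbd2 caller further below relies on.

static int example_map_block(struct inode *inode, sector_t file_block,
			     sector_t *disk_block)
{
	sector_t blk = file_block;
	int err;

	err = bmap(inode, &blk);	/* -EINVAL if the fs has no ->bmap */
	if (err)
		return err;
	if (!blk)
		return -ENOENT;		/* file_block falls into a hole */

	*disk_block = blk;
	return 0;
}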
|
||||
|
||||
/*
|
||||
* Update times in overlayed inode from underlying real inode
|
||||
|
28
fs/ioctl.c
@@ -173,6 +173,34 @@ static int fiemap_check_ranges(struct super_block *sb,
	return 0;
}

int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
		u64 start, u64 *len, u32 supported_flags)
{
	u64 maxbytes = inode->i_sb->s_maxbytes;
	u32 incompat_flags;

	if (*len == 0)
		return -EINVAL;

	if (start > maxbytes)
		return -EFBIG;

	/*
	 * Shrink request scope to what the fs can actually handle.
	 */
	if (*len > maxbytes || (maxbytes - *len) < start)
		*len = maxbytes - start;

	supported_flags &= FIEMAP_FLAGS_COMPAT;
	incompat_flags = fieinfo->fi_flags & ~supported_flags;
	if (incompat_flags) {
		fieinfo->fi_flags = incompat_flags;
		return -EBADR;
	}
	return 0;
}
EXPORT_SYMBOL(fiemap_prep);
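A rough sketch of how a filesystem's ->fiemap implementation is expected to use the new helper; example_fiemap and the supported flag set are assumptions for illustration, not taken from this hunk.

static int example_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			  u64 start, u64 len)
{
	int ret;

	/* Clamp the range to s_maxbytes and reject unsupported flags. */
	ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	/*
	 * ... walk the extents in [start, start + len) and report each one
	 * via fiemap_fill_next_extent() ...
	 */
	return 0;
}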
|
||||
|
||||
static int ioctl_fiemap(struct file *filp, unsigned long arg)
|
||||
{
|
||||
struct fiemap fiemap;
|
||||
|
@ -803,18 +803,23 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
|
||||
{
|
||||
int err = 0;
|
||||
unsigned long long ret;
|
||||
sector_t block = 0;
|
||||
|
||||
if (journal->j_inode) {
|
||||
ret = bmap(journal->j_inode, blocknr);
|
||||
if (ret)
|
||||
*retp = ret;
|
||||
else {
|
||||
block = blocknr;
|
||||
ret = bmap(journal->j_inode, &block);
|
||||
|
||||
if (ret || !block) {
|
||||
printk(KERN_ALERT "%s: journal block not found "
|
||||
"at offset %lu on %s\n",
|
||||
__func__, blocknr, journal->j_devname);
|
||||
err = -EIO;
|
||||
__journal_abort_soft(journal, err);
|
||||
|
||||
} else {
|
||||
*retp = block;
|
||||
}
|
||||
|
||||
} else {
|
||||
*retp = blocknr; /* +journal->j_blk_offset */
|
||||
}
|
||||
@ -1240,11 +1245,14 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev,
|
||||
journal_t *jbd2_journal_init_inode(struct inode *inode)
|
||||
{
|
||||
journal_t *journal;
|
||||
sector_t blocknr;
|
||||
char *p;
|
||||
unsigned long long blocknr;
|
||||
int err = 0;
|
||||
|
||||
blocknr = bmap(inode, 0);
|
||||
if (!blocknr) {
|
||||
blocknr = 0;
|
||||
err = bmap(inode, &blocknr);
|
||||
|
||||
if (err || !blocknr) {
|
||||
pr_err("%s: Cannot locate journal superblock\n",
|
||||
__func__);
|
||||
return NULL;
|
||||
|
110
fs/libfs.c
@ -1227,27 +1227,38 @@ bool is_empty_dir_inode(struct inode *inode)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_UNICODE
|
||||
bool needs_casefold(const struct inode *dir)
|
||||
/*
|
||||
* Determine if the name of a dentry should be casefolded.
|
||||
*
|
||||
* Return: if names will need casefolding
|
||||
*/
|
||||
static bool needs_casefold(const struct inode *dir)
|
||||
{
|
||||
return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding &&
|
||||
(!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir));
|
||||
return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding;
|
||||
}
|
||||
EXPORT_SYMBOL(needs_casefold);
|
||||
|
||||
int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
|
||||
const char *str, const struct qstr *name)
|
||||
/**
|
||||
* generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
|
||||
* @dentry: dentry whose name we are checking against
|
||||
* @len: len of name of dentry
|
||||
* @str: str pointer to name of dentry
|
||||
* @name: Name to compare against
|
||||
*
|
||||
* Return: 0 if names match, 1 if mismatch, or -ERRNO
|
||||
*/
|
||||
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
|
||||
const char *str, const struct qstr *name)
|
||||
{
|
||||
const struct dentry *parent = READ_ONCE(dentry->d_parent);
|
||||
const struct inode *inode = READ_ONCE(parent->d_inode);
|
||||
const struct inode *dir = READ_ONCE(parent->d_inode);
|
||||
const struct super_block *sb = dentry->d_sb;
|
||||
const struct unicode_map *um = sb->s_encoding;
|
||||
struct qstr entry = QSTR_INIT(str, len);
|
||||
struct qstr qstr = QSTR_INIT(str, len);
|
||||
char strbuf[DNAME_INLINE_LEN];
|
||||
int ret;
|
||||
|
||||
if (!inode || !needs_casefold(inode))
|
||||
if (!dir || !needs_casefold(dir))
|
||||
goto fallback;
|
||||
|
||||
/*
|
||||
* If the dentry name is stored in-line, then it may be concurrently
|
||||
* modified by a rename. If this happens, the VFS will eventually retry
|
||||
@ -1258,47 +1269,44 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
|
||||
if (len <= DNAME_INLINE_LEN - 1) {
|
||||
memcpy(strbuf, str, len);
|
||||
strbuf[len] = 0;
|
||||
entry.name = strbuf;
|
||||
qstr.name = strbuf;
|
||||
/* prevent compiler from optimizing out the temporary buffer */
|
||||
barrier();
|
||||
}
|
||||
|
||||
ret = utf8_strncasecmp(um, name, &entry);
|
||||
ret = utf8_strncasecmp(um, name, &qstr);
|
||||
if (ret >= 0)
|
||||
return ret;
|
||||
|
||||
if (sb_has_enc_strict_mode(sb))
|
||||
if (sb_has_strict_encoding(sb))
|
||||
return -EINVAL;
|
||||
fallback:
|
||||
if (len != name->len)
|
||||
return 1;
|
||||
return !!memcmp(str, name->name, len);
|
||||
}
|
||||
EXPORT_SYMBOL(generic_ci_d_compare);
|
||||
|
||||
int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
|
||||
/**
|
||||
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
|
||||
* @dentry: dentry of the parent directory
|
||||
* @str: qstr of name whose hash we should fill in
|
||||
*
|
||||
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
|
||||
*/
|
||||
static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
|
||||
{
|
||||
const struct inode *inode = READ_ONCE(dentry->d_inode);
|
||||
const struct inode *dir = READ_ONCE(dentry->d_inode);
|
||||
struct super_block *sb = dentry->d_sb;
|
||||
const struct unicode_map *um = sb->s_encoding;
|
||||
int ret = 0;
|
||||
|
||||
if (!inode || !needs_casefold(inode))
|
||||
if (!dir || !needs_casefold(dir))
|
||||
return 0;
|
||||
|
||||
ret = utf8_casefold_hash(um, dentry, str);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
|
||||
if (ret < 0 && sb_has_strict_encoding(sb))
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
err:
|
||||
if (sb_has_enc_strict_mode(sb))
|
||||
ret = -EINVAL;
|
||||
else
|
||||
ret = 0;
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(generic_ci_d_hash);
|
||||
|
||||
static const struct dentry_operations generic_ci_dentry_ops = {
|
||||
.d_hash = generic_ci_d_hash,
|
||||
@ -1312,7 +1320,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = {
|
||||
};
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_UNICODE) && IS_ENABLED(CONFIG_FS_ENCRYPTION)
|
||||
#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
|
||||
static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
|
||||
.d_hash = generic_ci_d_hash,
|
||||
.d_compare = generic_ci_d_compare,
|
||||
@ -1322,28 +1330,48 @@ static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
|
||||
|
||||
/**
|
||||
* generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
|
||||
* @dir: parent of dentry whose ops to set
|
||||
* @dentry: detnry to set ops on
|
||||
* @dentry: dentry to set ops on
|
||||
*
|
||||
* This function sets the dentry ops for the given dentry to handle both
|
||||
* casefolding and encryption of the dentry name.
|
||||
* Casefolded directories need d_hash and d_compare set, so that the dentries
|
||||
* contained in them are handled case-insensitively. Note that these operations
|
||||
* are needed on the parent directory rather than on the dentries in it, and
|
||||
* while the casefolding flag can be toggled on and off on an empty directory,
|
||||
* dentry_operations can't be changed later. As a result, if the filesystem has
|
||||
* casefolding support enabled at all, we have to give all dentries the
|
||||
* casefolding operations even if their inode doesn't have the casefolding flag
|
||||
* currently (and thus the casefolding ops would be no-ops for now).
|
||||
*
|
||||
* Encryption works differently in that the only dentry operation it needs is
|
||||
* d_revalidate, which it only needs on dentries that have the no-key name flag.
|
||||
* The no-key flag can't be set "later", so we don't have to worry about that.
|
||||
*
|
||||
* Finally, to maximize compatibility with overlayfs (which isn't compatible
|
||||
* with certain dentry operations) and to avoid taking an unnecessary
|
||||
* performance hit, we use custom dentry_operations for each possible
|
||||
* combination rather than always installing all operations.
|
||||
*/
|
||||
void generic_set_encrypted_ci_d_ops(struct inode *dir, struct dentry *dentry)
|
||||
void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
|
||||
{
|
||||
#ifdef CONFIG_FS_ENCRYPTION
|
||||
if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) {
|
||||
#ifdef CONFIG_UNICODE
|
||||
if (dir->i_sb->s_encoding) {
|
||||
d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
|
||||
return;
|
||||
}
|
||||
bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
|
||||
#endif
|
||||
#ifdef CONFIG_UNICODE
|
||||
bool needs_ci_ops = dentry->d_sb->s_encoding;
|
||||
#endif
|
||||
#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
|
||||
if (needs_encrypt_ops && needs_ci_ops) {
|
||||
d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef CONFIG_FS_ENCRYPTION
|
||||
if (needs_encrypt_ops) {
|
||||
d_set_d_op(dentry, &generic_encrypted_dentry_ops);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#ifdef CONFIG_UNICODE
|
||||
if (dir->i_sb->s_encoding) {
|
||||
if (needs_ci_ops) {
|
||||
d_set_d_op(dentry, &generic_ci_dentry_ops);
|
||||
return;
|
||||
}
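A caller sketch of the new one-argument form, mirroring how the in-tree lookup paths use this helper (example_lookup is hypothetical): the dentry ops are installed from ->lookup() before the dentry is attached, since dentry_operations cannot be changed later.

static struct dentry *example_lookup(struct inode *dir, struct dentry *dentry,
				     unsigned int flags)
{
	struct inode *inode = NULL;

	generic_set_encrypted_ci_d_ops(dentry);	/* no longer needs @dir */

	/* ... resolve the name to an inode here ... */
	return d_splice_alias(inode, dentry);
}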
|
||||
|
@@ -739,9 +739,8 @@ static int do_dentry_open(struct file *f,
	path_get(&f->f_path);
	f->f_inode = inode;
	f->f_mapping = inode->i_mapping;

	/* Ensure that we skip any errors that predate opening of the file */
	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
	f->f_sb_err = file_sample_sb_err(f);

	if (unlikely(f->f_flags & O_PATH)) {
		f->f_mode = FMODE_PATH;
|
||||
|
@ -710,21 +710,18 @@ EXPORT_SYMBOL(dquot_quota_sync);
|
||||
static unsigned long
|
||||
dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
|
||||
{
|
||||
struct list_head *head;
|
||||
struct dquot *dquot;
|
||||
unsigned long freed = 0;
|
||||
|
||||
spin_lock(&dq_list_lock);
|
||||
head = free_dquots.prev;
|
||||
while (head != &free_dquots && sc->nr_to_scan) {
|
||||
dquot = list_entry(head, struct dquot, dq_free);
|
||||
while (!list_empty(&free_dquots) && sc->nr_to_scan) {
|
||||
dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
|
||||
remove_dquot_hash(dquot);
|
||||
remove_free_dquot(dquot);
|
||||
remove_inuse(dquot);
|
||||
do_destroy_dquot(dquot);
|
||||
sc->nr_to_scan--;
|
||||
freed++;
|
||||
head = free_dquots.prev;
|
||||
}
|
||||
spin_unlock(&dq_list_lock);
|
||||
return freed;
|
||||
@ -2152,7 +2149,7 @@ int dquot_file_open(struct inode *inode, struct file *file)
|
||||
|
||||
error = generic_file_open(inode, file);
|
||||
if (!error && (file->f_mode & FMODE_WRITE))
|
||||
dquot_initialize(inode);
|
||||
error = dquot_initialize(inode);
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL(dquot_file_open);
|
||||
@ -2971,7 +2968,7 @@ static int __init dquot_init(void)
|
||||
NULL);
|
||||
|
||||
order = 0;
|
||||
dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order);
|
||||
dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
|
||||
if (!dquot_hash)
|
||||
panic("Cannot create dquot hash table");
|
||||
|
||||
|
@@ -156,7 +156,7 @@ SYSCALL_DEFINE1(syncfs, int, fd)
{
	struct fd f = fdget(fd);
	struct super_block *sb;
	int ret;
	int ret, ret2;

	if (!f.file)
		return -EBADF;
@@ -166,8 +166,10 @@ SYSCALL_DEFINE1(syncfs, int, fd)
	ret = sync_filesystem(sb);
	up_read(&sb->s_umount);

	ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);

	fdput(f);
	return ret;
	return ret ? ret : ret2;
}
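The user-visible effect, sketched as a hypothetical userspace program: syncfs() can now report a filesystem-wide writeback error once per file description, via the f_sb_err cursor sampled when the fd was opened.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int check_fs_errors(const char *mountpoint)
{
	int fd = open(mountpoint, O_RDONLY | O_DIRECTORY);

	if (fd < 0)
		return -errno;
	/* Fails with e.g. EIO/ENOSPC if writeback failed since this open. */
	if (syncfs(fd) < 0)
		fprintf(stderr, "syncfs(%s): %s\n", mountpoint, strerror(errno));
	close(fd);
	return 0;
}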
|
||||
|
||||
/**
|
||||
|
@ -222,7 +222,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
|
||||
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
|
||||
|
||||
err = fscrypt_prepare_lookup(dir, dentry, &nm);
|
||||
ubifs_set_d_ops(dir, dentry);
|
||||
generic_set_encrypted_ci_d_ops(dentry);
|
||||
if (err == -ENOENT)
|
||||
return d_splice_alias(NULL, dentry);
|
||||
if (err)
|
||||
|
@ -138,7 +138,7 @@ int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
|
||||
|
||||
while ((c = utf8byte(&cur))) {
|
||||
if (c < 0)
|
||||
return c;
|
||||
return -EINVAL;
|
||||
hash = partial_name_hash((unsigned char)c, hash);
|
||||
}
|
||||
str->hash = end_name_hash(hash);
|
||||
|
@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \
|
||||
init.o \
|
||||
measure.o \
|
||||
open.o \
|
||||
read_metadata.o \
|
||||
verify.o
|
||||
|
||||
obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o
|
||||
|
@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* fs/verity/enable.c: ioctl to enable verity on a file
|
||||
* Ioctl to enable verity on a file
|
||||
*
|
||||
* Copyright 2019 Google LLC
|
||||
*/
|
||||
@ -398,9 +398,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
|
||||
* Some pages of the file may have been evicted from pagecache after
|
||||
* being used in the Merkle tree construction, then read into pagecache
|
||||
* again by another process reading from the file concurrently. Since
|
||||
* these pages didn't undergo verification against the file measurement
|
||||
* which fs-verity now claims to be enforcing, we have to wipe the
|
||||
* pagecache to ensure that all future reads are verified.
|
||||
* these pages didn't undergo verification against the file digest which
|
||||
* fs-verity now claims to be enforcing, we have to wipe the pagecache
|
||||
* to ensure that all future reads are verified.
|
||||
*/
|
||||
filemap_write_and_wait(inode->i_mapping);
|
||||
invalidate_inode_pages2(inode->i_mapping);
|
||||
|
@ -67,52 +67,22 @@ struct merkle_tree_params {
|
||||
* When a verity file is first opened, an instance of this struct is allocated
|
||||
* and stored in ->i_verity_info; it remains until the inode is evicted. It
|
||||
* caches information about the Merkle tree that's needed to efficiently verify
|
||||
* data read from the file. It also caches the file measurement. The Merkle
|
||||
* tree pages themselves are not cached here, but the filesystem may cache them.
|
||||
* data read from the file. It also caches the file digest. The Merkle tree
|
||||
* pages themselves are not cached here, but the filesystem may cache them.
|
||||
*/
|
||||
struct fsverity_info {
|
||||
struct merkle_tree_params tree_params;
|
||||
u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
|
||||
u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
|
||||
u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
|
||||
const struct inode *inode;
|
||||
};
|
||||
|
||||
/*
|
||||
* Merkle tree properties. The file measurement is the hash of this structure
|
||||
* excluding the signature and with the sig_size field set to 0.
|
||||
*/
|
||||
struct fsverity_descriptor {
|
||||
__u8 version; /* must be 1 */
|
||||
__u8 hash_algorithm; /* Merkle tree hash algorithm */
|
||||
__u8 log_blocksize; /* log2 of size of data and tree blocks */
|
||||
__u8 salt_size; /* size of salt in bytes; 0 if none */
|
||||
__le32 sig_size; /* size of signature in bytes; 0 if none */
|
||||
__le64 data_size; /* size of file the Merkle tree is built over */
|
||||
__u8 root_hash[64]; /* Merkle tree root hash */
|
||||
__u8 salt[32]; /* salt prepended to each hashed block */
|
||||
__u8 __reserved[144]; /* must be 0's */
|
||||
__u8 signature[]; /* optional PKCS#7 signature */
|
||||
};
|
||||
|
||||
/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
|
||||
#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
|
||||
|
||||
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
|
||||
sizeof(struct fsverity_descriptor))
|
||||
|
||||
/*
|
||||
* Format in which verity file measurements are signed. This is the same as
|
||||
* 'struct fsverity_digest', except here some magic bytes are prepended to
|
||||
* provide some context about what is being signed in case the same key is used
|
||||
* for non-fsverity purposes, and here the fields have fixed endianness.
|
||||
*/
|
||||
struct fsverity_signed_digest {
|
||||
char magic[8]; /* must be "FSVerity" */
|
||||
__le16 digest_algorithm;
|
||||
__le16 digest_size;
|
||||
__u8 digest[];
|
||||
};
|
||||
|
||||
/* hash_algs.c */
|
||||
|
||||
extern struct fsverity_hash_alg fsverity_hash_algs[];
|
||||
@ -152,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
|
||||
const u8 *salt, size_t salt_size);
|
||||
|
||||
struct fsverity_info *fsverity_create_info(const struct inode *inode,
|
||||
void *desc, size_t desc_size);
|
||||
struct fsverity_descriptor *desc,
|
||||
size_t desc_size);
|
||||
|
||||
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
|
||||
|
||||
void fsverity_free_info(struct fsverity_info *vi);
|
||||
|
||||
int fsverity_get_descriptor(struct inode *inode,
|
||||
struct fsverity_descriptor **desc_ret,
|
||||
size_t *desc_size_ret);
|
||||
|
||||
int __init fsverity_init_info_cache(void);
|
||||
void __init fsverity_exit_info_cache(void);
|
||||
|
||||
@ -165,15 +140,13 @@ void __init fsverity_exit_info_cache(void);
|
||||
|
||||
#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
|
||||
int fsverity_verify_signature(const struct fsverity_info *vi,
|
||||
const struct fsverity_descriptor *desc,
|
||||
size_t desc_size);
|
||||
const u8 *signature, size_t sig_size);
|
||||
|
||||
int __init fsverity_init_signature(void);
|
||||
#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
|
||||
static inline int
|
||||
fsverity_verify_signature(const struct fsverity_info *vi,
|
||||
const struct fsverity_descriptor *desc,
|
||||
size_t desc_size)
|
||||
const u8 *signature, size_t sig_size)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* fs/verity/hash_algs.c: fs-verity hash algorithms
|
||||
* fs-verity hash algorithms
|
||||
*
|
||||
* Copyright 2019 Google LLC
|
||||
*/
|
||||
|
@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* fs/verity/init.c: fs-verity module initialization and logging
|
||||
* fs-verity module initialization and logging
|
||||
*
|
||||
* Copyright 2019 Google LLC
|
||||
*/
|
||||
|
@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* fs/verity/measure.c: ioctl to get a verity file's measurement
|
||||
* Ioctl to get a verity file's digest
|
||||
*
|
||||
* Copyright 2019 Google LLC
|
||||
*/
|
||||
@ -10,12 +10,12 @@
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
/**
|
||||
* fsverity_ioctl_measure() - get a verity file's measurement
|
||||
* @filp: file to get measurement of
|
||||
* fsverity_ioctl_measure() - get a verity file's digest
|
||||
* @filp: file to get digest of
|
||||
* @_uarg: user pointer to fsverity_digest
|
||||
*
|
||||
* Retrieve the file measurement that the kernel is enforcing for reads from a
|
||||
* verity file. See the "FS_IOC_MEASURE_VERITY" section of
|
||||
* Retrieve the file digest that the kernel is enforcing for reads from a verity
|
||||
* file. See the "FS_IOC_MEASURE_VERITY" section of
|
||||
* Documentation/filesystems/fsverity.rst for the documentation.
|
||||
*
|
||||
* Return: 0 on success, -errno on failure
|
||||
@ -51,7 +51,7 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
|
||||
if (copy_to_user(uarg, &arg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size))
|
||||
if (copy_to_user(uarg->digest, vi->file_digest, hash_alg->digest_size))
|
||||
return -EFAULT;
|
||||
|
||||
return 0;
|
||||
|
170
fs/verity/open.c
@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* fs/verity/open.c: opening fs-verity files
|
||||
* Opening fs-verity files
|
||||
*
|
||||
* Copyright 2019 Google LLC
|
||||
*/
|
||||
@ -124,63 +124,35 @@ out_err:
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute the file measurement by hashing the fsverity_descriptor excluding the
|
||||
* Compute the file digest by hashing the fsverity_descriptor excluding the
|
||||
* signature and with the sig_size field set to 0.
|
||||
*/
|
||||
static int compute_file_measurement(struct fsverity_hash_alg *hash_alg,
|
||||
struct fsverity_descriptor *desc,
|
||||
u8 *measurement)
|
||||
static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
|
||||
struct fsverity_descriptor *desc,
|
||||
u8 *file_digest)
|
||||
{
|
||||
__le32 sig_size = desc->sig_size;
|
||||
int err;
|
||||
|
||||
desc->sig_size = 0;
|
||||
err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
|
||||
err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
|
||||
desc->sig_size = sig_size;
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate the given fsverity_descriptor and create a new fsverity_info from
|
||||
* it. The signature (if present) is also checked.
|
||||
* Create a new fsverity_info from the given fsverity_descriptor (with optional
|
||||
* appended signature), and check the signature if present. The
|
||||
* fsverity_descriptor must have already undergone basic validation.
|
||||
*/
|
||||
struct fsverity_info *fsverity_create_info(const struct inode *inode,
|
||||
void *_desc, size_t desc_size)
|
||||
struct fsverity_descriptor *desc,
|
||||
size_t desc_size)
|
||||
{
|
||||
struct fsverity_descriptor *desc = _desc;
|
||||
struct fsverity_info *vi;
|
||||
int err;
|
||||
|
||||
if (desc_size < sizeof(*desc)) {
|
||||
fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
|
||||
desc_size);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
if (desc->version != 1) {
|
||||
fsverity_err(inode, "Unrecognized descriptor version: %u",
|
||||
desc->version);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
|
||||
fsverity_err(inode, "Reserved bits set in descriptor");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
if (desc->salt_size > sizeof(desc->salt)) {
|
||||
fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
if (le64_to_cpu(desc->data_size) != inode->i_size) {
|
||||
fsverity_err(inode,
|
||||
"Wrong data_size: %llu (desc) != %lld (inode)",
|
||||
le64_to_cpu(desc->data_size), inode->i_size);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
|
||||
if (!vi)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
@ -199,17 +171,18 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
|
||||
|
||||
memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
|
||||
|
||||
err = compute_file_measurement(vi->tree_params.hash_alg, desc,
|
||||
vi->measurement);
|
||||
err = compute_file_digest(vi->tree_params.hash_alg, desc,
|
||||
vi->file_digest);
|
||||
if (err) {
|
||||
fsverity_err(inode, "Error %d computing file measurement", err);
|
||||
fsverity_err(inode, "Error %d computing file digest", err);
|
||||
goto out;
|
||||
}
|
||||
pr_debug("Computed file measurement: %s:%*phN\n",
|
||||
pr_debug("Computed file digest: %s:%*phN\n",
|
||||
vi->tree_params.hash_alg->name,
|
||||
vi->tree_params.digest_size, vi->measurement);
|
||||
vi->tree_params.digest_size, vi->file_digest);
|
||||
|
||||
err = fsverity_verify_signature(vi, desc, desc_size);
|
||||
err = fsverity_verify_signature(vi, desc->signature,
|
||||
le32_to_cpu(desc->sig_size));
|
||||
out:
|
||||
if (err) {
|
||||
fsverity_free_info(vi);
|
||||
@ -221,11 +194,20 @@ out:
|
||||
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
|
||||
{
|
||||
/*
|
||||
* Multiple processes may race to set ->i_verity_info, so use cmpxchg.
|
||||
* This pairs with the READ_ONCE() in fsverity_get_info().
|
||||
* Multiple tasks may race to set ->i_verity_info, so use
|
||||
* cmpxchg_release(). This pairs with the smp_load_acquire() in
|
||||
* fsverity_get_info(). I.e., here we publish ->i_verity_info with a
|
||||
* RELEASE barrier so that other tasks can ACQUIRE it.
|
||||
*/
|
||||
if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL)
|
||||
if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
|
||||
/* Lost the race, so free the fsverity_info we allocated. */
|
||||
fsverity_free_info(vi);
|
||||
/*
|
||||
* Afterwards, the caller may access ->i_verity_info directly,
|
||||
* so make sure to ACQUIRE the winning fsverity_info.
|
||||
*/
|
||||
(void)fsverity_get_info(inode);
|
||||
}
|
||||
}
|
||||
|
||||
void fsverity_free_info(struct fsverity_info *vi)
|
||||
@ -236,15 +218,57 @@ void fsverity_free_info(struct fsverity_info *vi)
|
||||
kmem_cache_free(fsverity_info_cachep, vi);
|
||||
}
|
||||
|
||||
/* Ensure the inode has an ->i_verity_info */
|
||||
static int ensure_verity_info(struct inode *inode)
|
||||
static bool validate_fsverity_descriptor(struct inode *inode,
|
||||
const struct fsverity_descriptor *desc,
|
||||
size_t desc_size)
|
||||
{
|
||||
struct fsverity_info *vi = fsverity_get_info(inode);
|
||||
struct fsverity_descriptor *desc;
|
||||
int res;
|
||||
if (desc_size < sizeof(*desc)) {
|
||||
fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
|
||||
desc_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (vi)
|
||||
return 0;
|
||||
if (desc->version != 1) {
|
||||
fsverity_err(inode, "Unrecognized descriptor version: %u",
|
||||
desc->version);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
|
||||
fsverity_err(inode, "Reserved bits set in descriptor");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (desc->salt_size > sizeof(desc->salt)) {
|
||||
fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (le64_to_cpu(desc->data_size) != inode->i_size) {
|
||||
fsverity_err(inode,
|
||||
"Wrong data_size: %llu (desc) != %lld (inode)",
|
||||
le64_to_cpu(desc->data_size), inode->i_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) {
|
||||
fsverity_err(inode, "Signature overflows verity descriptor");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Read the inode's fsverity_descriptor (with optional appended signature) from
|
||||
* the filesystem, and do basic validation of it.
|
||||
*/
|
||||
int fsverity_get_descriptor(struct inode *inode,
|
||||
struct fsverity_descriptor **desc_ret,
|
||||
size_t *desc_size_ret)
|
||||
{
|
||||
int res;
|
||||
struct fsverity_descriptor *desc;
|
||||
|
||||
res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
|
||||
if (res < 0) {
|
||||
@ -263,20 +287,46 @@ static int ensure_verity_info(struct inode *inode)
|
||||
res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
|
||||
if (res < 0) {
|
||||
fsverity_err(inode, "Error %d reading verity descriptor", res);
|
||||
goto out_free_desc;
|
||||
kfree(desc);
|
||||
return res;
|
||||
}
|
||||
|
||||
vi = fsverity_create_info(inode, desc, res);
|
||||
if (!validate_fsverity_descriptor(inode, desc, res)) {
|
||||
kfree(desc);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
*desc_ret = desc;
|
||||
*desc_size_ret = res;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Ensure the inode has an ->i_verity_info */
|
||||
static int ensure_verity_info(struct inode *inode)
|
||||
{
|
||||
struct fsverity_info *vi = fsverity_get_info(inode);
|
||||
struct fsverity_descriptor *desc;
|
||||
size_t desc_size;
|
||||
int err;
|
||||
|
||||
if (vi)
|
||||
return 0;
|
||||
|
||||
err = fsverity_get_descriptor(inode, &desc, &desc_size);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
vi = fsverity_create_info(inode, desc, desc_size);
|
||||
if (IS_ERR(vi)) {
|
||||
res = PTR_ERR(vi);
|
||||
err = PTR_ERR(vi);
|
||||
goto out_free_desc;
|
||||
}
|
||||
|
||||
fsverity_set_info(inode, vi);
|
||||
res = 0;
|
||||
err = 0;
|
||||
out_free_desc:
|
||||
kfree(desc);
|
||||
return res;
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
|
195
fs/verity/read_metadata.c
Normal file
@ -0,0 +1,195 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Ioctl to read verity metadata
|
||||
*
|
||||
* Copyright 2021 Google LLC
|
||||
*/
|
||||
|
||||
#include "fsverity_private.h"
|
||||
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
static int fsverity_read_merkle_tree(struct inode *inode,
|
||||
const struct fsverity_info *vi,
|
||||
void __user *buf, u64 offset, int length)
|
||||
{
|
||||
const struct fsverity_operations *vops = inode->i_sb->s_vop;
|
||||
u64 end_offset;
|
||||
unsigned int offs_in_page;
|
||||
pgoff_t index, last_index;
|
||||
int retval = 0;
|
||||
int err = 0;
|
||||
|
||||
end_offset = min(offset + length, vi->tree_params.tree_size);
|
||||
if (offset >= end_offset)
|
||||
return 0;
|
||||
offs_in_page = offset_in_page(offset);
|
||||
last_index = (end_offset - 1) >> PAGE_SHIFT;
|
||||
|
||||
/*
|
||||
* Iterate through each Merkle tree page in the requested range and copy
|
||||
* the requested portion to userspace. Note that the Merkle tree block
|
||||
* size isn't important here, as we are returning a byte stream; i.e.,
|
||||
* we can just work with pages even if the tree block size != PAGE_SIZE.
|
||||
*/
|
||||
for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
|
||||
unsigned long num_ra_pages =
|
||||
min_t(unsigned long, last_index - index + 1,
|
||||
inode->i_sb->s_bdi->io_pages);
|
||||
unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
|
||||
PAGE_SIZE - offs_in_page);
|
||||
struct page *page;
|
||||
const void *virt;
|
||||
|
||||
page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
|
||||
if (IS_ERR(page)) {
|
||||
err = PTR_ERR(page);
|
||||
fsverity_err(inode,
|
||||
"Error %d reading Merkle tree page %lu",
|
||||
err, index);
|
||||
break;
|
||||
}
|
||||
|
||||
virt = kmap(page);
|
||||
if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
|
||||
kunmap(page);
|
||||
put_page(page);
|
||||
err = -EFAULT;
|
||||
break;
|
||||
}
|
||||
kunmap(page);
|
||||
put_page(page);
|
||||
|
||||
retval += bytes_to_copy;
|
||||
buf += bytes_to_copy;
|
||||
offset += bytes_to_copy;
|
||||
|
||||
if (fatal_signal_pending(current)) {
|
||||
err = -EINTR;
|
||||
break;
|
||||
}
|
||||
cond_resched();
|
||||
offs_in_page = 0;
|
||||
}
|
||||
return retval ? retval : err;
|
||||
}
|
||||
|
||||
/* Copy the requested portion of the buffer to userspace. */
|
||||
static int fsverity_read_buffer(void __user *dst, u64 offset, int length,
|
||||
const void *src, size_t src_length)
|
||||
{
|
||||
if (offset >= src_length)
|
||||
return 0;
|
||||
src += offset;
|
||||
src_length -= offset;
|
||||
|
||||
length = min_t(size_t, length, src_length);
|
||||
|
||||
if (copy_to_user(dst, src, length))
|
||||
return -EFAULT;
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
static int fsverity_read_descriptor(struct inode *inode,
|
||||
void __user *buf, u64 offset, int length)
|
||||
{
|
||||
struct fsverity_descriptor *desc;
|
||||
size_t desc_size;
|
||||
int res;
|
||||
|
||||
res = fsverity_get_descriptor(inode, &desc, &desc_size);
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
/* don't include the signature */
|
||||
desc_size = offsetof(struct fsverity_descriptor, signature);
|
||||
desc->sig_size = 0;
|
||||
|
||||
res = fsverity_read_buffer(buf, offset, length, desc, desc_size);
|
||||
|
||||
kfree(desc);
|
||||
return res;
|
||||
}
|
||||
|
||||
static int fsverity_read_signature(struct inode *inode,
|
||||
void __user *buf, u64 offset, int length)
|
||||
{
|
||||
struct fsverity_descriptor *desc;
|
||||
size_t desc_size;
|
||||
int res;
|
||||
|
||||
res = fsverity_get_descriptor(inode, &desc, &desc_size);
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
if (desc->sig_size == 0) {
|
||||
res = -ENODATA;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Include only the signature. Note that fsverity_get_descriptor()
|
||||
* already verified that sig_size is in-bounds.
|
||||
*/
|
||||
res = fsverity_read_buffer(buf, offset, length, desc->signature,
|
||||
le32_to_cpu(desc->sig_size));
|
||||
out:
|
||||
kfree(desc);
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* fsverity_ioctl_read_metadata() - read verity metadata from a file
|
||||
* @filp: file to read the metadata from
|
||||
* @uarg: user pointer to fsverity_read_metadata_arg
|
||||
*
|
||||
* Return: length read on success, 0 on EOF, -errno on failure
|
||||
*/
|
||||
int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg)
|
||||
{
|
||||
struct inode *inode = file_inode(filp);
|
||||
const struct fsverity_info *vi;
|
||||
struct fsverity_read_metadata_arg arg;
|
||||
int length;
|
||||
void __user *buf;
|
||||
|
||||
vi = fsverity_get_info(inode);
|
||||
if (!vi)
|
||||
return -ENODATA; /* not a verity file */
|
||||
/*
|
||||
* Note that we don't have to explicitly check that the file is open for
|
||||
* reading, since verity files can only be opened for reading.
|
||||
*/
|
||||
|
||||
if (copy_from_user(&arg, uarg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (arg.__reserved)
|
||||
return -EINVAL;
|
||||
|
||||
/* offset + length must not overflow. */
|
||||
if (arg.offset + arg.length < arg.offset)
|
||||
return -EINVAL;
|
||||
|
||||
/* Ensure that the return value will fit in INT_MAX. */
|
||||
length = min_t(u64, arg.length, INT_MAX);
|
||||
|
||||
buf = u64_to_user_ptr(arg.buf_ptr);
|
||||
|
||||
switch (arg.metadata_type) {
|
||||
case FS_VERITY_METADATA_TYPE_MERKLE_TREE:
|
||||
return fsverity_read_merkle_tree(inode, vi, buf, arg.offset,
|
||||
length);
|
||||
case FS_VERITY_METADATA_TYPE_DESCRIPTOR:
|
||||
return fsverity_read_descriptor(inode, buf, arg.offset, length);
|
||||
case FS_VERITY_METADATA_TYPE_SIGNATURE:
|
||||
return fsverity_read_signature(inode, buf, arg.offset, length);
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata);
|
@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/verity/signature.c: verification of builtin signatures
 * Verification of builtin signatures
 *
 * Copyright 2019 Google LLC
 */
@ -29,22 +29,20 @@ static struct key *fsverity_keyring;

/**
 * fsverity_verify_signature() - check a verity file's signature
 * @vi: the file's fsverity_info
 * @desc: the file's fsverity_descriptor
 * @desc_size: size of @desc
 * @signature: the file's built-in signature
 * @sig_size: size of signature in bytes, or 0 if no signature
 *
 * If the file's fs-verity descriptor includes a signature of the file
 * measurement, verify it against the certificates in the fs-verity keyring.
 * If the file includes a signature of its fs-verity file digest, verify it
 * against the certificates in the fs-verity keyring.
 *
 * Return: 0 on success (signature valid or not required); -errno on failure
 */
int fsverity_verify_signature(const struct fsverity_info *vi,
			      const struct fsverity_descriptor *desc,
			      size_t desc_size)
			      const u8 *signature, size_t sig_size)
{
	const struct inode *inode = vi->inode;
	const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
	const u32 sig_size = le32_to_cpu(desc->sig_size);
	struct fsverity_signed_digest *d;
	struct fsverity_formatted_digest *d;
	int err;

	if (sig_size == 0) {
@ -56,22 +54,16 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
		return 0;
	}

	if (sig_size > desc_size - sizeof(*desc)) {
		fsverity_err(inode, "Signature overflows verity descriptor");
		return -EBADMSG;
	}

	d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
	if (!d)
		return -ENOMEM;
	memcpy(d->magic, "FSVerity", 8);
	d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs);
	d->digest_size = cpu_to_le16(hash_alg->digest_size);
	memcpy(d->digest, vi->measurement, hash_alg->digest_size);
	memcpy(d->digest, vi->file_digest, hash_alg->digest_size);

	err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
				     desc->signature, sig_size,
				     fsverity_keyring,
				     signature, sig_size, fsverity_keyring,
				     VERIFYING_UNSPECIFIED_SIGNATURE,
				     NULL, NULL);
	kfree(d);
@ -90,8 +82,8 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
		return err;
	}

	pr_debug("Valid signature for file measurement %s:%*phN\n",
		 hash_alg->name, hash_alg->digest_size, vi->measurement);
	pr_debug("Valid signature for file digest %s:%*phN\n",
		 hash_alg->name, hash_alg->digest_size, vi->file_digest);
	return 0;
}

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages()
 * Data verification functions, i.e. hooks for ->readpages()
 *
 * Copyright 2019 Google LLC
 */

@ -219,7 +219,7 @@ struct dentry_operations {

#define DCACHE_MAY_FREE			0x00800000
#define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
#define DCACHE_ENCRYPTED_NAME		0x02000000 /* Encrypted name (dir key was unavailable) */
#define DCACHE_NOKEY_NAME		0x02000000 /* Encrypted name encoded without key */
#define DCACHE_OP_REAL			0x04000000

#define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */

@ -34,6 +34,7 @@
#define F2FS_ROOT_INO(sbi)	((sbi)->root_ino_num)
#define F2FS_NODE_INO(sbi)	((sbi)->node_ino_num)
#define F2FS_META_INO(sbi)	((sbi)->meta_ino_num)
#define F2FS_COMPRESS_INO(sbi)	(NM_I(sbi)->max_nid)

#define F2FS_MAX_QUOTAS		3

@ -168,7 +169,7 @@ struct f2fs_checkpoint {
	unsigned char alloc_type[MAX_ACTIVE_LOGS];

	/* SIT and NAT version bitmap */
	unsigned char sit_nat_version_bitmap[1];
	unsigned char sit_nat_version_bitmap[];
} __packed;

#define CP_CHKSUM_OFFSET	4092	/* default chksum offset in checkpoint */
@ -229,6 +230,7 @@ struct f2fs_extent {
#define F2FS_INLINE_DOTS	0x10	/* file having implicit dot dentries */
#define F2FS_EXTRA_ATTR		0x20	/* file having extra attribute */
#define F2FS_PIN_FILE		0x40	/* file should not be gced */
#define F2FS_COMPRESS_RELEASED	0x80	/* file released compressed blocks */

struct f2fs_inode {
	__le16 i_mode;			/* file mode */
@ -273,7 +275,10 @@ struct f2fs_inode {
			__le64 i_compr_blocks;		/* # of compressed blocks */
			__u8 i_compress_algorithm;	/* compress algorithm */
			__u8 i_log_cluster_size;	/* log of cluster size */
			__le16 i_padding;		/* padding */
			__le16 i_compress_flag;		/* compress flag */
						/* 0 bit: chksum flag
						 * [10,15] bits: compress level
						 */
			__le32 i_extra_end[0];	/* for attribute size calculation */
		} __packed;
		__le32 i_addr[DEF_ADDRS_PER_INODE];	/* Pointers to data blocks */

@ -901,9 +901,7 @@ struct file {
|
||||
#endif /* #ifdef CONFIG_EPOLL */
|
||||
struct address_space *f_mapping;
|
||||
errseq_t f_wb_err;
|
||||
#ifdef CONFIG_FILE_TABLE_DEBUG
|
||||
struct hlist_node f_hash;
|
||||
#endif /* #ifdef CONFIG_FILE_TABLE_DEBUG */
|
||||
errseq_t f_sb_err; /* for syncfs */
|
||||
} __randomize_layout
|
||||
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
|
||||
|
||||
@ -1320,7 +1318,7 @@ extern int send_sigurg(struct fown_struct *fown);
|
||||
/* These flags relate to encoding and casefolding */
|
||||
#define SB_ENC_STRICT_MODE_FL (1 << 0)
|
||||
|
||||
#define sb_has_enc_strict_mode(sb) \
|
||||
#define sb_has_strict_encoding(sb) \
|
||||
(sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
|
||||
|
||||
/*
|
||||
@ -1389,7 +1387,10 @@ struct super_block {
|
||||
#ifdef CONFIG_FS_VERITY
|
||||
const struct fsverity_operations *s_vop;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_UNICODE
|
||||
struct unicode_map *s_encoding;
|
||||
__u16 s_encoding_flags;
|
||||
#endif
|
||||
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
|
||||
#ifdef CONFIG_UNICODE
|
||||
struct unicode_map *s_encoding;
|
||||
@ -1443,6 +1444,9 @@ struct super_block {
|
||||
/* Being remounted read-only */
|
||||
int s_readonly_remount;
|
||||
|
||||
/* per-sb errseq_t for reporting writeback errors via syncfs */
|
||||
errseq_t s_wb_err;
|
||||
|
||||
/* AIO completions deferred from interrupt context */
|
||||
struct workqueue_struct *s_dio_done_wq;
|
||||
struct hlist_head s_pins;
|
||||
@ -1668,6 +1672,9 @@ struct fiemap_extent_info {
|
||||
struct fiemap_extent __user *fi_extents_start; /* Start of
|
||||
fiemap_extent array */
|
||||
};
|
||||
|
||||
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
||||
u64 start, u64 *len, u32 supported_flags);
|
||||
int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
|
||||
u64 phys, u64 len, u32 flags);
|
||||
int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
|
||||
@ -2733,6 +2740,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
|
||||
return errseq_sample(&mapping->wb_err);
|
||||
}
|
||||
|
||||
/**
|
||||
* file_sample_sb_err - sample the current errseq_t to test for later errors
|
||||
* @mapping: mapping to be sampled
|
||||
*
|
||||
* Grab the most current superblock-level errseq_t value for the given
|
||||
* struct file.
|
||||
*/
|
||||
static inline errseq_t file_sample_sb_err(struct file *file)
|
||||
{
|
||||
return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
|
||||
}
|
||||
|
||||
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
|
||||
int datasync);
|
||||
extern int vfs_fsync(struct file *file, int datasync);
|
||||
@ -2757,9 +2776,16 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
|
||||
|
||||
extern void emergency_sync(void);
|
||||
extern void emergency_remount(void);
|
||||
|
||||
#ifdef CONFIG_BLOCK
|
||||
extern sector_t bmap(struct inode *, sector_t);
|
||||
extern int bmap(struct inode *inode, sector_t *block);
|
||||
#else
|
||||
static inline int bmap(struct inode *inode, sector_t *block)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
#endif
|
||||
|
||||
extern int notify_change(struct dentry *, struct iattr *, struct inode **);
|
||||
extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **);
|
||||
extern int inode_permission(struct inode *, int);
|
||||
@ -3225,19 +3251,7 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
|
||||
|
||||
extern int generic_check_addressable(unsigned, u64);
|
||||
|
||||
#ifdef CONFIG_UNICODE
|
||||
extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
|
||||
extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
|
||||
const char *str, const struct qstr *name);
|
||||
extern bool needs_casefold(const struct inode *dir);
|
||||
#else
|
||||
static inline bool needs_casefold(const struct inode *dir)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
extern void generic_set_encrypted_ci_d_ops(struct inode *dir,
|
||||
struct dentry *dentry);
|
||||
extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
|
||||
|
||||
#ifdef CONFIG_MIGRATION
|
||||
extern int buffer_migrate_page(struct address_space *,
|
||||
|
@ -36,7 +36,7 @@ struct fscrypt_name {
|
||||
u32 hash;
|
||||
u32 minor_hash;
|
||||
struct fscrypt_str crypto_buf;
|
||||
bool is_ciphertext_name;
|
||||
bool is_nokey_name;
|
||||
};
|
||||
|
||||
#define FSTR_INIT(n, l) { .name = n, .len = l }
|
||||
@ -107,15 +107,15 @@ fscrypt_get_dummy_context(struct super_block *sb)
|
||||
}
|
||||
|
||||
/*
|
||||
* When d_splice_alias() moves a directory's encrypted alias to its decrypted
|
||||
* alias as a result of the encryption key being added, DCACHE_ENCRYPTED_NAME
|
||||
* must be cleared. Note that we don't have to support arbitrary moves of this
|
||||
* flag because fscrypt doesn't allow encrypted aliases to be the source or
|
||||
* target of a rename().
|
||||
* When d_splice_alias() moves a directory's no-key alias to its plaintext alias
|
||||
* as a result of the encryption key being added, DCACHE_NOKEY_NAME must be
|
||||
* cleared. Note that we don't have to support arbitrary moves of this flag
|
||||
* because fscrypt doesn't allow no-key names to be the source or target of a
|
||||
* rename().
|
||||
*/
|
||||
static inline void fscrypt_handle_d_move(struct dentry *dentry)
|
||||
{
|
||||
dentry->d_flags &= ~DCACHE_ENCRYPTED_NAME;
|
||||
dentry->d_flags &= ~DCACHE_NOKEY_NAME;
|
||||
}
|
||||
|
||||
/* crypto.c */
|
||||
@ -207,6 +207,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
|
||||
bool fscrypt_match_name(const struct fscrypt_name *fname,
|
||||
const u8 *de_name, u32 de_name_len);
|
||||
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);
|
||||
int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags);
|
||||
|
||||
/* bio.c */
|
||||
void fscrypt_decrypt_bio(struct bio *bio);
|
||||
@ -471,6 +472,12 @@ static inline u64 fscrypt_fname_siphash(const struct inode *dir,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int fscrypt_d_revalidate(struct dentry *dentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* bio.c */
|
||||
static inline void fscrypt_decrypt_bio(struct bio *bio)
|
||||
{
|
||||
@ -729,18 +736,19 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir,
|
||||
* @fname: (output) the name to use to search the on-disk directory
|
||||
*
|
||||
* Prepare for ->lookup() in a directory which may be encrypted by determining
|
||||
* the name that will actually be used to search the directory on-disk. Lookups
|
||||
* can be done with or without the directory's encryption key; without the key,
|
||||
* filenames are presented in encrypted form. Therefore, we'll try to set up
|
||||
* the directory's encryption key, but even without it the lookup can continue.
|
||||
* the name that will actually be used to search the directory on-disk. If the
|
||||
* directory's encryption key is available, then the lookup is assumed to be by
|
||||
* plaintext name; otherwise, it is assumed to be by no-key name.
|
||||
*
|
||||
* After calling this function, a filesystem should ensure that it's dentry
|
||||
* operations contain fscrypt_d_revalidate if DCACHE_ENCRYPTED_NAME was set,
|
||||
* so that the dentry can be invalidated if the key is later added.
|
||||
* This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key
|
||||
* name. In this case the filesystem must assign the dentry a dentry_operations
|
||||
* which contains fscrypt_d_revalidate (or contains a d_revalidate method that
|
||||
* calls fscrypt_d_revalidate), so that the dentry will be invalidated if the
|
||||
* directory's encryption key is later added.
|
||||
*
|
||||
* Return: 0 on success; -ENOENT if key is unavailable but the filename isn't a
|
||||
* correctly formed encoded ciphertext name, so a negative dentry should be
|
||||
* created; or another -errno code.
|
||||
* Return: 0 on success; -ENOENT if the directory's key is unavailable but the
|
||||
* filename isn't a valid no-key name, so a negative dentry should be created;
|
||||
* or another -errno code.
|
||||
*/
|
||||
static inline int fscrypt_prepare_lookup(struct inode *dir,
|
||||
struct dentry *dentry,
|
||||
|
@ -115,8 +115,13 @@ struct fsverity_operations {

static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
	/* pairs with the cmpxchg() in fsverity_set_info() */
	return READ_ONCE(inode->i_verity_info);
	/*
	 * Pairs with the cmpxchg_release() in fsverity_set_info().
	 * I.e., another task may publish ->i_verity_info concurrently,
	 * executing a RELEASE barrier. We need to use smp_load_acquire() here
	 * to safely ACQUIRE the memory the other task published.
	 */
	return smp_load_acquire(&inode->i_verity_info);
}

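The replacement above trades a plain READ_ONCE() for smp_load_acquire(), pairing with the cmpxchg_release() that publishes ->i_verity_info. As a rough, illustrative sketch of that publish side (the helper name and exact shape are assumptions, not part of this diff):

	/*
	 * Illustrative only: fully initialize *vi, then publish it with RELEASE
	 * semantics so that a reader doing smp_load_acquire() in
	 * fsverity_get_info() observes a completely initialized structure.
	 */
	static void example_publish_verity_info(struct inode *inode,
						struct fsverity_info *vi)
	{
		struct fsverity_info *prev;

		prev = cmpxchg_release(&inode->i_verity_info, NULL, vi);
		if (prev)
			fsverity_free_info(vi);	/* lost the race; keep 'prev' */
	}
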
/* enable.c */
@ -133,6 +138,10 @@ int fsverity_file_open(struct inode *inode, struct file *filp);
int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
void fsverity_cleanup_inode(struct inode *inode);

/* read_metadata.c */

int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);

/* verify.c */

bool fsverity_verify_page(struct page *page);
@ -178,6 +187,14 @@ static inline void fsverity_cleanup_inode(struct inode *inode)
{
}

/* read_metadata.c */

static inline int fsverity_ioctl_read_metadata(struct file *filp,
					       const void __user *uarg)
{
	return -EOPNOTSUPP;
}

/* verify.c */

static inline bool fsverity_verify_page(struct page *page)
@ -49,7 +49,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error)
		return;

	/* Record in wb_err for checkers using errseq_t based tracking */
	filemap_set_wb_err(mapping, error);
	__filemap_set_wb_err(mapping, error);

	/* Record it in superblock */
	errseq_set(&mapping->host->i_sb->s_wb_err, error);

	/* Record it in flags for now, for legacy callers */
	if (error == -ENOSPC)

@ -6,6 +6,7 @@
|
||||
#define _TRACE_F2FS_H
|
||||
|
||||
#include <linux/tracepoint.h>
|
||||
#include <uapi/linux/f2fs.h>
|
||||
|
||||
#define show_dev(dev) MAJOR(dev), MINOR(dev)
|
||||
#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino
|
||||
@ -121,13 +122,15 @@ TRACE_DEFINE_ENUM(CP_RESIZE);
|
||||
|
||||
#define show_alloc_mode(type) \
|
||||
__print_symbolic(type, \
|
||||
{ LFS, "LFS-mode" }, \
|
||||
{ SSR, "SSR-mode" })
|
||||
{ LFS, "LFS-mode" }, \
|
||||
{ SSR, "SSR-mode" }, \
|
||||
{ AT_SSR, "AT_SSR-mode" })
|
||||
|
||||
#define show_victim_policy(type) \
|
||||
__print_symbolic(type, \
|
||||
{ GC_GREEDY, "Greedy" }, \
|
||||
{ GC_CB, "Cost-Benefit" })
|
||||
{ GC_CB, "Cost-Benefit" }, \
|
||||
{ GC_AT, "Age-threshold" })
|
||||
|
||||
#define show_cpreason(type) \
|
||||
__print_flags(type, "|", \
|
||||
@ -546,17 +549,17 @@ TRACE_EVENT(f2fs_truncate_partial_nodes,
|
||||
|
||||
TRACE_EVENT(f2fs_file_write_iter,
|
||||
|
||||
TP_PROTO(struct inode *inode, unsigned long offset,
|
||||
unsigned long length, int ret),
|
||||
TP_PROTO(struct inode *inode, loff_t offset, size_t length,
|
||||
ssize_t ret),
|
||||
|
||||
TP_ARGS(inode, offset, length, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(ino_t, ino)
|
||||
__field(unsigned long, offset)
|
||||
__field(unsigned long, length)
|
||||
__field(int, ret)
|
||||
__field(loff_t, offset)
|
||||
__field(size_t, length)
|
||||
__field(ssize_t, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@ -568,7 +571,7 @@ TRACE_EVENT(f2fs_file_write_iter,
|
||||
),
|
||||
|
||||
TP_printk("dev = (%d,%d), ino = %lu, "
|
||||
"offset = %lu, length = %lu, written(err) = %d",
|
||||
"offset = %lld, length = %zu, written(err) = %zd",
|
||||
show_dev_ino(__entry),
|
||||
__entry->offset,
|
||||
__entry->length,
|
||||
@ -576,9 +579,10 @@ TRACE_EVENT(f2fs_file_write_iter,
|
||||
);
|
||||
|
||||
TRACE_EVENT(f2fs_map_blocks,
|
||||
TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int ret),
|
||||
TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map,
|
||||
int create, int flag, int ret),
|
||||
|
||||
TP_ARGS(inode, map, ret),
|
||||
TP_ARGS(inode, map, create, flag, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
@ -589,11 +593,14 @@ TRACE_EVENT(f2fs_map_blocks,
|
||||
__field(unsigned int, m_flags)
|
||||
__field(int, m_seg_type)
|
||||
__field(bool, m_may_create)
|
||||
__field(bool, m_multidev_dio)
|
||||
__field(int, create)
|
||||
__field(int, flag)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->dev = map->m_bdev->bd_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->m_lblk = map->m_lblk;
|
||||
__entry->m_pblk = map->m_pblk;
|
||||
@ -601,12 +608,16 @@ TRACE_EVENT(f2fs_map_blocks,
|
||||
__entry->m_flags = map->m_flags;
|
||||
__entry->m_seg_type = map->m_seg_type;
|
||||
__entry->m_may_create = map->m_may_create;
|
||||
__entry->m_multidev_dio = map->m_multidev_dio;
|
||||
__entry->create = create;
|
||||
__entry->flag = flag;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, "
|
||||
"start blkaddr = 0x%llx, len = 0x%llx, flags = %u,"
|
||||
"seg_type = %d, may_create = %d, err = %d",
|
||||
"start blkaddr = 0x%llx, len = 0x%llx, flags = %u, "
|
||||
"seg_type = %d, may_create = %d, multidevice = %d, "
|
||||
"create = %d, flag = %d, err = %d",
|
||||
show_dev_ino(__entry),
|
||||
(unsigned long long)__entry->m_lblk,
|
||||
(unsigned long long)__entry->m_pblk,
|
||||
@ -614,6 +625,9 @@ TRACE_EVENT(f2fs_map_blocks,
|
||||
__entry->m_flags,
|
||||
__entry->m_seg_type,
|
||||
__entry->m_may_create,
|
||||
__entry->m_multidev_dio,
|
||||
__entry->create,
|
||||
__entry->flag,
|
||||
__entry->ret)
|
||||
);
|
||||
|
||||
@ -813,20 +827,20 @@ TRACE_EVENT(f2fs_lookup_start,
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(ino_t, ino)
|
||||
__field(const char *, name)
|
||||
__string(name, dentry->d_name.name)
|
||||
__field(unsigned int, flags)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->dev = dir->i_sb->s_dev;
|
||||
__entry->ino = dir->i_ino;
|
||||
__entry->name = dentry->d_name.name;
|
||||
__assign_str(name, dentry->d_name.name);
|
||||
__entry->flags = flags;
|
||||
),
|
||||
|
||||
TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u",
|
||||
show_dev_ino(__entry),
|
||||
__entry->name,
|
||||
__get_str(name),
|
||||
__entry->flags)
|
||||
);
|
||||
|
||||
@ -840,7 +854,7 @@ TRACE_EVENT(f2fs_lookup_end,
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(ino_t, ino)
|
||||
__field(const char *, name)
|
||||
__string(name, dentry->d_name.name)
|
||||
__field(nid_t, cino)
|
||||
__field(int, err)
|
||||
),
|
||||
@ -848,14 +862,14 @@ TRACE_EVENT(f2fs_lookup_end,
|
||||
TP_fast_assign(
|
||||
__entry->dev = dir->i_sb->s_dev;
|
||||
__entry->ino = dir->i_ino;
|
||||
__entry->name = dentry->d_name.name;
|
||||
__assign_str(name, dentry->d_name.name);
|
||||
__entry->cino = ino;
|
||||
__entry->err = err;
|
||||
),
|
||||
|
||||
TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d",
|
||||
show_dev_ino(__entry),
|
||||
__entry->name,
|
||||
__get_str(name),
|
||||
__entry->cino,
|
||||
__entry->err)
|
||||
);
|
||||
@ -1824,6 +1838,7 @@ DEFINE_EVENT(f2fs_zip_end, f2fs_decompress_pages_end,
|
||||
TP_ARGS(inode, cluster_idx, compressed_size, ret)
|
||||
);
|
||||
|
||||
#ifdef CONFIG_F2FS_IOSTAT
|
||||
TRACE_EVENT(f2fs_iostat,
|
||||
|
||||
TP_PROTO(struct f2fs_sb_info *sbi, unsigned long long *iostat),
|
||||
@ -1900,6 +1915,165 @@ TRACE_EVENT(f2fs_iostat,
|
||||
__entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio)
|
||||
);
|
||||
|
||||
#ifndef __F2FS_IOSTAT_LATENCY_TYPE
|
||||
#define __F2FS_IOSTAT_LATENCY_TYPE
|
||||
struct f2fs_iostat_latency {
|
||||
unsigned int peak_lat;
|
||||
unsigned int avg_lat;
|
||||
unsigned int cnt;
|
||||
};
|
||||
#endif /* __F2FS_IOSTAT_LATENCY_TYPE */
|
||||
|
||||
TRACE_EVENT(f2fs_iostat_latency,
|
||||
|
||||
TP_PROTO(struct f2fs_sb_info *sbi, struct f2fs_iostat_latency (*iostat_lat)[NR_PAGE_TYPE]),
|
||||
|
||||
TP_ARGS(sbi, iostat_lat),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(unsigned int, d_rd_peak)
|
||||
__field(unsigned int, d_rd_avg)
|
||||
__field(unsigned int, d_rd_cnt)
|
||||
__field(unsigned int, n_rd_peak)
|
||||
__field(unsigned int, n_rd_avg)
|
||||
__field(unsigned int, n_rd_cnt)
|
||||
__field(unsigned int, m_rd_peak)
|
||||
__field(unsigned int, m_rd_avg)
|
||||
__field(unsigned int, m_rd_cnt)
|
||||
__field(unsigned int, d_wr_s_peak)
|
||||
__field(unsigned int, d_wr_s_avg)
|
||||
__field(unsigned int, d_wr_s_cnt)
|
||||
__field(unsigned int, n_wr_s_peak)
|
||||
__field(unsigned int, n_wr_s_avg)
|
||||
__field(unsigned int, n_wr_s_cnt)
|
||||
__field(unsigned int, m_wr_s_peak)
|
||||
__field(unsigned int, m_wr_s_avg)
|
||||
__field(unsigned int, m_wr_s_cnt)
|
||||
__field(unsigned int, d_wr_as_peak)
|
||||
__field(unsigned int, d_wr_as_avg)
|
||||
__field(unsigned int, d_wr_as_cnt)
|
||||
__field(unsigned int, n_wr_as_peak)
|
||||
__field(unsigned int, n_wr_as_avg)
|
||||
__field(unsigned int, n_wr_as_cnt)
|
||||
__field(unsigned int, m_wr_as_peak)
|
||||
__field(unsigned int, m_wr_as_avg)
|
||||
__field(unsigned int, m_wr_as_cnt)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->dev = sbi->sb->s_dev;
|
||||
__entry->d_rd_peak = iostat_lat[0][DATA].peak_lat;
|
||||
__entry->d_rd_avg = iostat_lat[0][DATA].avg_lat;
|
||||
__entry->d_rd_cnt = iostat_lat[0][DATA].cnt;
|
||||
__entry->n_rd_peak = iostat_lat[0][NODE].peak_lat;
|
||||
__entry->n_rd_avg = iostat_lat[0][NODE].avg_lat;
|
||||
__entry->n_rd_cnt = iostat_lat[0][NODE].cnt;
|
||||
__entry->m_rd_peak = iostat_lat[0][META].peak_lat;
|
||||
__entry->m_rd_avg = iostat_lat[0][META].avg_lat;
|
||||
__entry->m_rd_cnt = iostat_lat[0][META].cnt;
|
||||
__entry->d_wr_s_peak = iostat_lat[1][DATA].peak_lat;
|
||||
__entry->d_wr_s_avg = iostat_lat[1][DATA].avg_lat;
|
||||
__entry->d_wr_s_cnt = iostat_lat[1][DATA].cnt;
|
||||
__entry->n_wr_s_peak = iostat_lat[1][NODE].peak_lat;
|
||||
__entry->n_wr_s_avg = iostat_lat[1][NODE].avg_lat;
|
||||
__entry->n_wr_s_cnt = iostat_lat[1][NODE].cnt;
|
||||
__entry->m_wr_s_peak = iostat_lat[1][META].peak_lat;
|
||||
__entry->m_wr_s_avg = iostat_lat[1][META].avg_lat;
|
||||
__entry->m_wr_s_cnt = iostat_lat[1][META].cnt;
|
||||
__entry->d_wr_as_peak = iostat_lat[2][DATA].peak_lat;
|
||||
__entry->d_wr_as_avg = iostat_lat[2][DATA].avg_lat;
|
||||
__entry->d_wr_as_cnt = iostat_lat[2][DATA].cnt;
|
||||
__entry->n_wr_as_peak = iostat_lat[2][NODE].peak_lat;
|
||||
__entry->n_wr_as_avg = iostat_lat[2][NODE].avg_lat;
|
||||
__entry->n_wr_as_cnt = iostat_lat[2][NODE].cnt;
|
||||
__entry->m_wr_as_peak = iostat_lat[2][META].peak_lat;
|
||||
__entry->m_wr_as_avg = iostat_lat[2][META].avg_lat;
|
||||
__entry->m_wr_as_cnt = iostat_lat[2][META].cnt;
|
||||
),
|
||||
|
||||
TP_printk("dev = (%d,%d), "
|
||||
"iotype [peak lat.(ms)/avg lat.(ms)/count], "
|
||||
"rd_data [%u/%u/%u], rd_node [%u/%u/%u], rd_meta [%u/%u/%u], "
|
||||
"wr_sync_data [%u/%u/%u], wr_sync_node [%u/%u/%u], "
|
||||
"wr_sync_meta [%u/%u/%u], wr_async_data [%u/%u/%u], "
|
||||
"wr_async_node [%u/%u/%u], wr_async_meta [%u/%u/%u]",
|
||||
show_dev(__entry->dev),
|
||||
__entry->d_rd_peak, __entry->d_rd_avg, __entry->d_rd_cnt,
|
||||
__entry->n_rd_peak, __entry->n_rd_avg, __entry->n_rd_cnt,
|
||||
__entry->m_rd_peak, __entry->m_rd_avg, __entry->m_rd_cnt,
|
||||
__entry->d_wr_s_peak, __entry->d_wr_s_avg, __entry->d_wr_s_cnt,
|
||||
__entry->n_wr_s_peak, __entry->n_wr_s_avg, __entry->n_wr_s_cnt,
|
||||
__entry->m_wr_s_peak, __entry->m_wr_s_avg, __entry->m_wr_s_cnt,
|
||||
__entry->d_wr_as_peak, __entry->d_wr_as_avg, __entry->d_wr_as_cnt,
|
||||
__entry->n_wr_as_peak, __entry->n_wr_as_avg, __entry->n_wr_as_cnt,
|
||||
__entry->m_wr_as_peak, __entry->m_wr_as_avg, __entry->m_wr_as_cnt)
|
||||
);
|
||||
#endif
|
||||
|
||||
TRACE_EVENT(f2fs_bmap,
|
||||
|
||||
TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock),
|
||||
|
||||
TP_ARGS(inode, lblock, pblock),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(ino_t, ino)
|
||||
__field(sector_t, lblock)
|
||||
__field(sector_t, pblock)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->lblock = lblock;
|
||||
__entry->pblock = pblock;
|
||||
),
|
||||
|
||||
TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld",
|
||||
show_dev_ino(__entry),
|
||||
(unsigned long long)__entry->lblock,
|
||||
(unsigned long long)__entry->pblock)
|
||||
);
|
||||
|
||||
TRACE_EVENT(f2fs_fiemap,
|
||||
|
||||
TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock,
|
||||
unsigned long long len, unsigned int flags, int ret),
|
||||
|
||||
TP_ARGS(inode, lblock, pblock, len, flags, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(ino_t, ino)
|
||||
__field(sector_t, lblock)
|
||||
__field(sector_t, pblock)
|
||||
__field(unsigned long long, len)
|
||||
__field(unsigned int, flags)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->lblock = lblock;
|
||||
__entry->pblock = pblock;
|
||||
__entry->len = len;
|
||||
__entry->flags = flags;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld, "
|
||||
"len:%llu, flags:%u, ret:%d",
|
||||
show_dev_ino(__entry),
|
||||
(unsigned long long)__entry->lblock,
|
||||
(unsigned long long)__entry->pblock,
|
||||
__entry->len,
|
||||
__entry->flags,
|
||||
__entry->ret)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_F2FS_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
include/uapi/linux/f2fs.h (new file, 98 lines)
@ -0,0 +1,98 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */

#ifndef _UAPI_LINUX_F2FS_H
#define _UAPI_LINUX_F2FS_H
#include <linux/types.h>
#include <linux/ioctl.h>

/*
 * f2fs-specific ioctl commands
 */
#define F2FS_IOCTL_MAGIC		0xf5
#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
#define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
#define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
#define F2FS_IOC_GARBAGE_COLLECT	_IOW(F2FS_IOCTL_MAGIC, 6, __u32)
#define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
#define F2FS_IOC_DEFRAGMENT		_IOWR(F2FS_IOCTL_MAGIC, 8,	\
						struct f2fs_defragment)
#define F2FS_IOC_MOVE_RANGE		_IOWR(F2FS_IOCTL_MAGIC, 9,	\
						struct f2fs_move_range)
#define F2FS_IOC_FLUSH_DEVICE		_IOW(F2FS_IOCTL_MAGIC, 10,	\
						struct f2fs_flush_device)
#define F2FS_IOC_GARBAGE_COLLECT_RANGE	_IOW(F2FS_IOCTL_MAGIC, 11,	\
						struct f2fs_gc_range)
#define F2FS_IOC_GET_FEATURES		_IOR(F2FS_IOCTL_MAGIC, 12, __u32)
#define F2FS_IOC_SET_PIN_FILE		_IOW(F2FS_IOCTL_MAGIC, 13, __u32)
#define F2FS_IOC_GET_PIN_FILE		_IOR(F2FS_IOCTL_MAGIC, 14, __u32)
#define F2FS_IOC_PRECACHE_EXTENTS	_IO(F2FS_IOCTL_MAGIC, 15)
#define F2FS_IOC_RESIZE_FS		_IOW(F2FS_IOCTL_MAGIC, 16, __u64)
#define F2FS_IOC_GET_COMPRESS_BLOCKS	_IOR(F2FS_IOCTL_MAGIC, 17, __u64)
#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS	\
					_IOR(F2FS_IOCTL_MAGIC, 18, __u64)
#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS	\
					_IOR(F2FS_IOCTL_MAGIC, 19, __u64)
#define F2FS_IOC_SEC_TRIM_FILE		_IOW(F2FS_IOCTL_MAGIC, 20,	\
						struct f2fs_sectrim_range)
#define F2FS_IOC_GET_COMPRESS_OPTION	_IOR(F2FS_IOCTL_MAGIC, 21,	\
						struct f2fs_comp_option)
#define F2FS_IOC_SET_COMPRESS_OPTION	_IOW(F2FS_IOCTL_MAGIC, 22,	\
						struct f2fs_comp_option)
#define F2FS_IOC_DECOMPRESS_FILE	_IO(F2FS_IOCTL_MAGIC, 23)
#define F2FS_IOC_COMPRESS_FILE		_IO(F2FS_IOCTL_MAGIC, 24)

/*
 * should be same as XFS_IOC_GOINGDOWN.
 * Flags for going down operation used by FS_IOC_GOINGDOWN
 */
#define F2FS_IOC_SHUTDOWN	_IOR('X', 125, __u32)	/* Shutdown */
#define F2FS_GOING_DOWN_FULLSYNC	0x0	/* going down with full sync */
#define F2FS_GOING_DOWN_METASYNC	0x1	/* going down with metadata */
#define F2FS_GOING_DOWN_NOSYNC		0x2	/* going down */
#define F2FS_GOING_DOWN_METAFLUSH	0x3	/* going down with meta flush */
#define F2FS_GOING_DOWN_NEED_FSCK	0x4	/* going down to trigger fsck */

/*
 * Flags used by F2FS_IOC_SEC_TRIM_FILE
 */
#define F2FS_TRIM_FILE_DISCARD		0x1	/* send discard command */
#define F2FS_TRIM_FILE_ZEROOUT		0x2	/* zero out */
#define F2FS_TRIM_FILE_MASK		0x3

struct f2fs_gc_range {
	__u32 sync;
	__u64 start;
	__u64 len;
};

struct f2fs_defragment {
	__u64 start;
	__u64 len;
};

struct f2fs_move_range {
	__u32 dst_fd;		/* destination fd */
	__u64 pos_in;		/* start position in src_fd */
	__u64 pos_out;		/* start position in dst_fd */
	__u64 len;		/* size to move */
};

struct f2fs_flush_device {
	__u32 dev_num;		/* device number to flush */
	__u32 segments;		/* # of segments to flush */
};

struct f2fs_sectrim_range {
	__u64 start;
	__u64 len;
	__u64 flags;
};

struct f2fs_comp_option {
	__u8 algorithm;
	__u8 log_cluster_size;
};

#endif /* _UAPI_LINUX_F2FS_H */
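Taken together, the definitions above are the whole ioctl surface this header exports. A minimal userspace sketch, assuming a hypothetical file path, that pins a file so f2fs garbage collection will not migrate its blocks:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/f2fs.h>

	int main(void)
	{
		__u32 pin = 1;
		int fd = open("/data/example.db", O_RDWR);	/* hypothetical path */

		if (fd < 0)
			return 1;
		/* F2FS_IOC_SET_PIN_FILE takes a __u32 flag: 1 = pin, 0 = unpin. */
		if (ioctl(fd, F2FS_IOC_SET_PIN_FILE, &pin) < 0)
			perror("F2FS_IOC_SET_PIN_FILE");
		close(fd);
		return 0;
	}
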
@ -34,7 +34,70 @@ struct fsverity_digest {
	__u8 digest[];
};

/*
 * Struct containing a file's Merkle tree properties. The fs-verity file digest
 * is the hash of this struct. A userspace program needs this struct only if it
 * needs to compute fs-verity file digests itself, e.g. in order to sign files.
 * It isn't needed just to enable fs-verity on a file.
 *
 * Note: when computing the file digest, 'sig_size' and 'signature' must be left
 * zero and empty, respectively. These fields are present only because some
 * filesystems reuse this struct as part of their on-disk format.
 */
struct fsverity_descriptor {
	__u8 version;		/* must be 1 */
	__u8 hash_algorithm;	/* Merkle tree hash algorithm */
	__u8 log_blocksize;	/* log2 of size of data and tree blocks */
	__u8 salt_size;		/* size of salt in bytes; 0 if none */
#ifdef __KERNEL__
	__le32 sig_size;
#else
	__le32 __reserved_0x04;	/* must be 0 */
#endif
	__le64 data_size;	/* size of file the Merkle tree is built over */
	__u8 root_hash[64];	/* Merkle tree root hash */
	__u8 salt[32];		/* salt prepended to each hashed block */
	__u8 __reserved[144];	/* must be 0's */
#ifdef __KERNEL__
	__u8 signature[];
#endif
};

/*
 * Format in which fs-verity file digests are signed in built-in signatures.
 * This is the same as 'struct fsverity_digest', except here some magic bytes
 * are prepended to provide some context about what is being signed in case the
 * same key is used for non-fsverity purposes, and here the fields have fixed
 * endianness.
 *
 * This struct is specific to the built-in signature verification support, which
 * is optional. fs-verity users may also verify signatures in userspace, in
 * which case userspace is responsible for deciding on what bytes are signed.
 * This struct may still be used, but it doesn't have to be. For example,
 * userspace could instead use a string like "sha256:$digest_as_hex_string".
 */
struct fsverity_formatted_digest {
	char magic[8];		/* must be "FSVerity" */
	__le16 digest_algorithm;
	__le16 digest_size;
	__u8 digest[];
};
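Because userspace may do its own signing, a signer needs to assemble exactly this layout around the raw file digest (as returned by FS_IOC_MEASURE_VERITY) before producing a PKCS#7 blob. A hedged sketch; the helper name is invented and it assumes the caller already supplies the two 16-bit fields in little-endian order:

	#include <stdlib.h>
	#include <string.h>
	#include <linux/fsverity.h>

	/* Illustrative only: wrap a raw fs-verity file digest in the
	 * fsverity_formatted_digest layout. Returns a malloc'd buffer. */
	static void *build_formatted_digest(__u16 alg_le, const __u8 *digest,
					    __u16 digest_size, size_t *out_len)
	{
		struct fsverity_formatted_digest *d;

		*out_len = sizeof(*d) + digest_size;
		d = calloc(1, *out_len);
		if (!d)
			return NULL;
		memcpy(d->magic, "FSVerity", 8);
		d->digest_algorithm = alg_le;	/* assumed already little-endian */
		d->digest_size = digest_size;	/* likewise */
		memcpy(d->digest, digest, digest_size);
		return d;
	}
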

#define FS_VERITY_METADATA_TYPE_MERKLE_TREE	1
#define FS_VERITY_METADATA_TYPE_DESCRIPTOR	2
#define FS_VERITY_METADATA_TYPE_SIGNATURE	3

struct fsverity_read_metadata_arg {
	__u64 metadata_type;
	__u64 offset;
	__u64 length;
	__u64 buf_ptr;
	__u64 __reserved;
};

#define FS_IOC_ENABLE_VERITY	_IOW('f', 133, struct fsverity_enable_arg)
#define FS_IOC_MEASURE_VERITY	_IOWR('f', 134, struct fsverity_digest)
#define FS_IOC_READ_VERITY_METADATA \
				_IOWR('f', 135, struct fsverity_read_metadata_arg)

#endif /* _UAPI_LINUX_FSVERITY_H */
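For completeness, a hedged userspace sketch of the new FS_IOC_READ_VERITY_METADATA ioctl that dumps a verity file's descriptor (path and buffer size are arbitrary; the return value is the number of bytes copied):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fsverity.h>

	int main(void)
	{
		char buf[4096];
		struct fsverity_read_metadata_arg arg;
		int fd = open("/data/verity-file", O_RDONLY);	/* hypothetical path */
		int ret;

		if (fd < 0)
			return 1;
		memset(&arg, 0, sizeof(arg));
		arg.metadata_type = FS_VERITY_METADATA_TYPE_DESCRIPTOR;
		arg.offset = 0;
		arg.length = sizeof(buf);
		arg.buf_ptr = (__u64)(unsigned long)buf;
		ret = ioctl(fd, FS_IOC_READ_VERITY_METADATA, &arg);
		if (ret < 0)
			perror("FS_IOC_READ_VERITY_METADATA");
		else
			printf("read %d bytes of descriptor\n", ret);
		close(fd);
		return 0;
	}
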

mm/page_io.c (11 changed lines)
@ -179,8 +179,9 @@ int generic_swapfile_activate(struct swap_info_struct *sis,

		cond_resched();

		first_block = bmap(inode, probe_block);
		if (first_block == 0)
		first_block = probe_block;
		ret = bmap(inode, &first_block);
		if (ret || !first_block)
			goto bad_bmap;

		/*
@ -195,9 +196,11 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
			block = probe_block + block_in_page;
			ret = bmap(inode, &block);
			if (ret || !block)
				goto bad_bmap;

			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
