Merge remote-tracking branch 'google/upstream-f2fs-stable-linux-4.14.y' into sheesh

* google/upstream-f2fs-stable-linux-4.14.y:
  f2fs: fix to do sanity check on .cp_pack_total_block_count
  f2fs: make gc_urgent and gc_segment_mode sysfs node readable
  f2fs: use aggressive GC policy during f2fs_disable_checkpoint()
  f2fs: fix compressed file start atomic write may cause data corruption
  f2fs: initialize sbi->gc_mode explicitly
  f2fs: introduce gc_urgent_mid mode
  f2fs: compress: fix to print raw data size in error path of lz4 decompression
  f2fs: remove redundant parameter judgment
  f2fs: use spin_lock to avoid hang
  f2fs: don't get FREEZE lock in f2fs_evict_inode in frozen fs
  f2fs: remove unnecessary read for F2FS_FITS_IN_INODE
  f2fs: fix to do sanity check on curseg->alloc_type
  f2fs: fix to avoid potential deadlock
  f2fs: quota: fix loop condition at f2fs_quota_sync()
  f2fs: Restore rwsem lockdep support
  f2fs: fix missing free nid in f2fs_handle_failed_inode
  f2fs: add a way to limit roll forward recovery time
  f2fs: introduce F2FS_IPU_HONOR_OPU_WRITE ipu policy
  f2fs: adjust readahead block number during recovery
  f2fs: fix to unlock page correctly in error path of is_alive()
  f2fs: expose discard related parameters in sysfs
  f2fs: move discard parameters into discard_cmd_control
  f2fs: fix to enable ATGC correctly via gc_idle sysfs interface
  f2fs: move f2fs to use reader-unfair rwsems
  f2fs: do not allow partial truncation on pinned file
  f2fs: remove redunant invalidate compress pages
  f2fs: Simplify bool conversion
  f2fs: don't drop compressed page cache in .{invalidate,release}page
  f2fs: fix to reserve space for IO align feature
  f2fs: fix to check available space of CP area correctly in update_ckpt_flags()
  f2fs: support fault injection to f2fs_trylock_op()
  f2fs: clean up __find_inline_xattr() with __find_xattr()
  f2fs: fix to do sanity check on last xattr entry in __f2fs_setxattr()
  f2fs: do not bother checkpoint by f2fs_get_node_info
  f2fs: avoid down_write on nat_tree_lock during checkpoint
  f2fs: compress: fix potential deadlock of compress file
  f2fs: avoid EINVAL by SBI_NEED_FSCK when pinning a file
  f2fs: add gc_urgent_high_remaining sysfs node
  f2fs: fix to do sanity check in is_alive()
  f2fs: fix to avoid panic in is_alive() if metadata is inconsistent
  f2fs: fix to do sanity check on inode type during garbage collection
  f2fs: avoid duplicate call of mark_inode_dirty
  f2fs: fix remove page failed in invalidate compress pages
  f2fs: fix the f2fs_file_write_iter tracepoint
  f2fs: do not expose unwritten blocks to user by DIO
  f2fs: reduce indentation in f2fs_file_write_iter()
  f2fs: rework write preallocations
  f2fs: compress: reduce one page array alloc and free when write compressed page
  f2fs: show number of pending discard commands
  f2fs: check nr_pages for readahead
  f2fs: fix UAF in f2fs_available_free_memory
  f2fs: invalidate META_MAPPING before IPU/DIO write
  f2fs: support fault injection for dquot_initialize()
  f2fs: fix incorrect return value in f2fs_sanity_check_ckpt()
  f2fs: compress: disallow disabling compress on non-empty compressed file
  f2fs: compress: fix overwrite may reduce compress ratio unproperly
  f2fs: multidevice: support direct IO
  f2fs: introduce fragment allocation mode mount option
  f2fs: include non-compressed blocks in compr_written_block
  f2fs: fix wrong condition to trigger background checkpoint correctly
  f2fs: fix to use WHINT_MODE
  f2fs: fix up f2fs_lookup tracepoints
  f2fs: set SBI_NEED_FSCK flag when inconsistent node block found
  f2fs: introduce excess_dirty_threshold()
  f2fs: avoid attaching SB_ACTIVE flag during mount
  f2fs: quota: fix potential deadlock
  f2fs: should use GFP_NOFS for directory inodes
  f2fs: should put a page beyond EOF when preparing a write
  f2fs: deallocate compressed pages when error happens
  f2fs: enable realtime discard iff device supports discard
  f2fs: guarantee to write dirty data when enabling checkpoint back
  f2fs: fix to unmap pages from userspace process in punch_hole()
  f2fs: fix unexpected ENOENT comes from f2fs_map_blocks()
  f2fs: fix to account missing .skipped_gc_rwsem
  f2fs: adjust unlock order for cleanup
  f2fs: Don't create discard thread when device doesn't support realtime discard
  f2fs: rebuild nat_bits during umount
  f2fs: introduce periodic iostat io latency traces
  f2fs: separate out iostat feature
  f2fs: compress: do sanity check on cluster
  f2fs: fix description about main_blkaddr node
  f2fs: convert S_IRUGO to 0444
  f2fs: fix to keep compatibility of fault injection interface
  f2fs: support fault injection for f2fs_kmem_cache_alloc()
  f2fs: compress: allow write compress released file after truncate to zero
  f2fs: correct comment in segment.h
  f2fs: improve sbi status info in debugfs/f2fs/status
  f2fs: compress: avoid duplicate counting of valid blocks when read compressed file
  f2fs: fix to do sanity check for sb/cp fields correctly
  f2fs: avoid unneeded memory allocation in __add_ino_entry()
  f2fs: extent cache: support unaligned extent
  f2fs: Kconfig: clean up config options about compression
  f2fs: reduce the scope of setting fsck tag when de->name_len is zero
  f2fs: fix to stop filesystem update once CP failed
  f2fs: introduce discard_unit mount option
  f2fs: fix min_seq_blocks can not make sense in some scenes.
  f2fs: fix to force keeping write barrier for strict fsync mode
  f2fs: fix wrong checkpoint_changed value in f2fs_remount()
  f2fs: show sbi status in debugfs/f2fs/status
  f2fs: turn back remapped address in compressed page endio
  f2fs: change fiemap way in printing compression chunk
  f2fs: do not submit NEW_ADDR to read node block
  f2fs: compress: remove unneeded read when rewrite whole cluster
  f2fs: don't sleep while grabing nat_tree_lock
  f2fs: remove allow_outplace_dio()
  f2fs: make f2fs_write_failed() take struct inode
  f2fs: quota: fix potential deadlock
  f2fs: let's keep writing IOs on SBI_NEED_FSCK
  f2fs: Revert "f2fs: Fix indefinite loop in f2fs_gc() v1"
  f2fs: avoid to create an empty string as the extension_list
  f2fs: compress: fix to set zstd compress level correctly
  f2fs: add sysfs nodes to get GC info for each GC mode
  f2fs: drop dirty node pages when cp is in error status
  f2fs: initialize page->private when using for our internal use
  f2fs: compress: add nocompress extensions support
  Revert "f2fs: avoid attaching SB_ACTIVE flag during mount/remount"
  f2fs: remove false alarm on iget failure during GC
  f2fs: enable extent cache for compression files in read-only
  f2fs: fix to avoid adding tab before doc section
  f2fs: introduce f2fs_casefolded_name slab cache
  f2fs: swap: support migrating swapfile in aligned write mode
  f2fs: swap: remove dead codes
  f2fs: compress: add compress_inode to cache compressed blocks
  f2fs: clean up /sys/fs/f2fs/<disk>/features
  f2fs: add pin_file in feature list
  f2fs: Advertise encrypted casefolding in sysfs
  f2fs: Show casefolding support only when supported
  f2fs: support RO feature
  f2fs: logging neatening
  f2fs: restructure f2fs page.private layout
  f2fs: introduce FI_COMPRESS_RELEASED instead of using IMMUTABLE bit
  f2fs: compress: remove unneeded preallocation
  f2fs: avoid attaching SB_ACTIVE flag during mount/remount
  f2fs: atgc: export entries for better tunability via sysfs
  f2fs: compress: fix to disallow temp extension
  f2fs: let's allow compression for mmap files
  f2fs: add MODULE_SOFTDEP to ensure crc32 is included in the initramfs
  f2fs: return success if there is no work to do
  f2fs: compress: clean up parameter of __f2fs_cluster_blocks()
  f2fs: compress: remove unneeded f2fs_put_dnode()
  f2fs: atgc: fix to set default age threshold
  f2fs: Prevent swap file in LFS mode
  f2fs: fix to avoid racing on fsync_entry_slab by multi filesystem instances
  f2fs: add cp_error check in f2fs_write_compressed_pages
  f2fs: compress: rename __cluster_may_compress
  f2fs: return EINVAL for hole cases in swap file
  f2fs: avoid swapon failure by giving a warning first
  f2fs: compress: fix to assign cc.cluster_idx correctly
  f2fs: compress: fix race condition of overwrite vs truncate
  f2fs: compress: fix to free compress page correctly
  f2fs: support iflag change given the mask
  f2fs: avoid null pointer access when handling IPU error
  f2fs: drop inplace IO if fs status is abnormal
  f2fs: compress: remove unneed check condition
  f2fs: clean up left deprecated IO trace codes
  f2fs: avoid using native allocate_segment_by_default()
  f2fs: remove unnecessary struct declaration
  f2fs: fix to avoid NULL pointer dereference
  f2fs: avoid duplicated codes for cleanup
  f2fs: document: add description about compressed space handling
  f2fs: clean up build warnings
  f2fs: fix the periodic wakeups of discard thread
  f2fs: fix to avoid accessing invalid fio in f2fs_allocate_data_block()
  f2fs: fix to avoid GC/mmap race with f2fs_truncate()
  f2fs: set checkpoint_merge by default
  f2fs: Fix a hungtask problem in atomic write
  f2fs: fix to restrict mount condition on readonly block device
  f2fs: introduce gc_merge mount option
  f2fs: fix to cover __allocate_new_section() with curseg_lock
  f2fs: fix wrong alloc_type in f2fs_do_replace_block
  f2fs: delete empty compress.h
  f2fs: fix a typo in inode.c
  f2fs: allow to change discard policy based on cached discard cmds
  f2fs: fix to avoid touching checkpointed data in get_victim()
  f2fs: fix to update last i_size if fallocate partially succeeds
  f2fs: fix error path of f2fs_remount()
  f2fs: fix wrong comment of nat_tree_lock
  f2fs: fix to avoid out-of-bounds memory access
  f2fs: don't start checkpoint thread in readonly mountpoint
  f2fs: do not use AT_SSR mode in FG_GC & high urgent BG_GC
  f2fs: add sysfs nodes to get runtime compression stat
  f2fs: fix to use per-inode maxbytes in f2fs_fiemap
  f2fs: fix to align to section for fallocate() on pinned file
  f2fs: expose # of overprivision segments
  f2fs: fix error handling in f2fs_end_enable_verity()
  f2fs: fix a redundant call to f2fs_balance_fs if an error occurs
  f2fs: remove unused file_clear_encrypt()
  f2fs: check if swapfile is section-alligned
  f2fs: fix last_lblock check in check_swap_activate_fast
  f2fs: remove unnecessary IS_SWAPFILE check
  f2fs: Replace one-element array with flexible-array member
  f2fs: compress: Allow modular (de)compression algorithms
  f2fs: check discard command number before traversing discard pending list
  f2fs: update comments for explicit memory barrier
  f2fs: remove unused FORCE_FG_GC macro
  f2fs: avoid unused f2fs_show_compress_options()
  f2fs: fix panic during f2fs_resize_fs()
  f2fs: fix to allow migrating fully valid segment
  f2fs: fix a spelling error
  f2fs: fix a spacing coding style
  fs: Enable bmap() function to properly return errors
  f2fs: remove obsolete f2fs.txt
  fs-verity: support reading signature with ioctl
  fs-verity: support reading descriptor with ioctl
  fs-verity: support reading Merkle tree with ioctl
  fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl
  fs-verity: don't pass whole descriptor to fsverity_verify_signature()
  fs-verity: factor out fsverity_get_descriptor()
  fs-verity: move structs needed for file signing to UAPI header
  fs-verity: rename "file measurement" to "file digest"
  fs-verity: rename fsverity_signed_digest to fsverity_formatted_digest
  fs-verity: remove filenames from file comments
  fs-verity: use smp_load_acquire() for ->i_verity_info
  f2fs: remove FAULT_ALLOC_BIO
  f2fs: use blkdev_issue_flush in __submit_flush_wait
  f2fs: remove a few bd_part checks
  quota: Cleanup list iteration in dqcache_shrink_scan()
  quota: reclaim least recently used dquots
  fs: quota: Replace GFP_ATOMIC with GFP_KERNEL in dquot_init
  quota: Check for register_shrinker() failure.
  quota: propagate error from __dquot_initialize
  quota: be aware of error from dquot_initialize
  Documentation: f2fs: fix typo s/automaic/automatic
  f2fs: give a warning only for readonly partition
  f2fs: don't grab superblock freeze for flush/ckpt thread
  f2fs: add ckpt_thread_ioprio sysfs node
  f2fs: introduce checkpoint_merge mount option
  f2fs: relocate inline conversion from mmap() to mkwrite()
  f2fs: fix a wrong condition in __submit_bio
  f2fs: remove unnecessary initialization in xattr.c
  f2fs: fix to avoid inconsistent quota data
  f2fs: flush data when enabling checkpoint back
  f2fs: deprecate f2fs_trace_io
  f2fs: remove unused stat_{inc, dec}_atomic_write
  f2fs: introduce sb_status sysfs node
  f2fs: fix to use per-inode maxbytes
  f2fs: compress: fix potential deadlock
  libfs: unexport generic_ci_d_compare() and generic_ci_d_hash()
  f2fs: fix to set/clear I_LINKABLE under i_lock
  f2fs: fix null page reference in redirty_blocks
  f2fs: clean up post-read processing
  f2fs: trival cleanup in move_data_block()
  f2fs: fix out-of-repair __setattr_copy()
  f2fs: fix to tag FIEMAP_EXTENT_MERGED in f2fs_fiemap()
  f2fs: introduce a new per-sb directory in sysfs
  f2fs: compress: support compress level
  f2fs: compress: deny setting unsupported compress algorithm
  f2fs: relocate f2fs_precache_extents()
  f2fs: enforce the immutable flag on open files
  f2fs: enhance to update i_mode and acl atomically in f2fs_setattr()
  f2fs: fix to set inode->i_mode correctly for posix_acl_update_mode
  f2fs: Replace expression with offsetof()
  f2fs: handle unallocated section and zone on pinned/atgc
  f2fs: compress: fix compression chksum
  f2fs: fix shift-out-of-bounds in sanity_check_raw_super()
  f2fs: fix race of pending_pages in decompression
  f2fs: fix to account inline xattr correctly during recovery
  f2fs: inline: fix wrong inline inode stat
  f2fs: inline: correct comment in f2fs_recover_inline_data
  f2fs: don't check PAGE_SIZE again in sanity_check_raw_super()
  f2fs: convert to F2FS_*_INO macro
  f2fs: introduce max_io_bytes, a sysfs entry, to limit bio size
  f2fs: don't allow any writes on readonly mount
  f2fs: avoid race condition for shrinker count
  f2fs: add F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE
  f2fs: add compress_mode mount option
  f2fs: Remove unnecessary unlikely()
  f2fs: init dirty_secmap incorrectly
  f2fs: remove buffer_head which has 32bits limit
  f2fs: fix wrong block count instead of bytes
  f2fs: use new conversion functions between blks and bytes
  f2fs: rename logical_to_blk and blk_to_logical
  f2fs: fix kbytes written stat for multi-device case
  f2fs: compress: support chksum
  f2fs: fix to avoid REQ_TIME and CP_TIME collision
  f2fs: change to use rwsem for cp_mutex
  f2fs: Handle casefolding with Encryption
  fscrypt: Have filesystems handle their d_ops
  libfs: Add generic function for setting dentry_ops
  f2fs: Remove the redundancy initialization
  f2fs: remove writeback_inodes_sb in f2fs_remount
  f2fs: fix double free of unicode map
  f2fs: fix compat F2FS_IOC_{MOVE,GARBAGE_COLLECT}_RANGE
  f2fs: avoid unneeded data copy in f2fs_ioc_move_range()
  f2fs: add F2FS_IOC_SET_COMPRESS_OPTION ioctl
  f2fs: add F2FS_IOC_GET_COMPRESS_OPTION ioctl
  f2fs: move ioctl interface definitions to separated file
  f2fs: fix to seek incorrect data offset in inline data file
  f2fs: check fiemap parameters
  f2fs: call f2fs_get_meta_page_retry for nat page
  fscrypt: rename DCACHE_ENCRYPTED_NAME to DCACHE_NOKEY_NAME
  fscrypt: don't call no-key names "ciphertext names"
  fscrypt: export fscrypt_d_revalidate()
  f2fs: code cleanup by removing unnecessary check
  f2fs: wait for sysfs kobject removal before freeing f2fs_sb_info
  f2fs: fix writecount false positive in releasing compress blocks
  f2fs: introduce check_swap_activate_fast()
  f2fs: don't issue flush in f2fs_flush_device_cache() for nobarrier case
  f2fs: handle errors of f2fs_get_meta_page_nofail
  f2fs: fix to set SBI_NEED_FSCK flag for inconsistent inode
  f2fs: reject CASEFOLD inode flag without casefold feature
  f2fs: fix memory alignment to support 32bit
  f2fs: fix slab leak of rpages pointer
  f2fs: compress: fix to disallow enabling compress on non-empty file
  f2fs: compress: introduce cic/dic slab cache
  f2fs: compress: introduce page array slab cache
  f2fs: fix to do sanity check on segment/section count
  f2fs: fix to check segment boundary during SIT page readahead
  f2fs: fix uninit-value in f2fs_lookup
  fs/buffer.c: record blockdev write errors in super_block that it backs
  vfs: track per-sb writeback errors and report them to syncfs
  f2fs: remove unneeded parameter in find_in_block()
  f2fs: fix wrong total_sections check and fsmeta check
  f2fs: remove duplicated code in sanity_check_area_boundary
  f2fs: remove unused check on version_bitmap
  f2fs: relocate blkzoned feature check
  f2fs: do sanity check on zoned block device path
  f2fs: add trace exit in exception path
  f2fs: change return value of reserved_segments to unsigned int
  f2fs: clean up kvfree
  f2fs: change virtual mapping way for compression pages
  f2fs: change return value of f2fs_disable_compressed_file to bool
  f2fs: change i_compr_blocks of inode to atomic value
  f2fs: ignore compress mount option on image w/o compression feature
  f2fs: allocate proper size memory for zstd decompress
  f2fs: change compr_blocks of superblock info to 64bit
  f2fs: add block address limit check to compressed file
  f2fs: check position in move range ioctl
  f2fs: correct statistic of APP_DIRECT_IO/APP_DIRECT_READ_IO
  f2fs: support age threshold based garbage collection
  f2fs: Use generic casefolding support
  fs: Add standard casefolding support
  unicode: Add utf8_casefold_hash
  f2fs: compress: use more readable atomic_t type for {cic,dic}.ref
  f2fs: fix compile warning
  f2fs: support 64-bits key in f2fs rb-tree node entry
  f2fs: inherit mtime of original block during GC
  f2fs: record average update time of segment
  f2fs: introduce inmem curseg
  f2fs: compress: remove unneeded code
  f2fs: remove duplicated type casting
  f2fs: support zone capacity less than zone size
  f2fs: update changes in upstream on GC_URGENT_HIGH
  f2fs: Return EOF on unaligned end of file DIO read
  f2fs: fix indefinite loop scanning for free nid
  f2fs: Fix type of section block count variables
  f2fs: prepare a waiter before entering io_schedule
  f2fs: update_sit_entry: Make the judgment condition of f2fs_bug_on more intuitive
  f2fs: replace test_and_set/clear_bit() with set/clear_bit()
  f2fs: make file immutable even if releasing zero compression block
  f2fs: compress: disable compression mount option if compression is off
  f2fs: compress: add sanity check during compressed cluster read
  f2fs: use macro instead of f2fs verity version
  f2fs: fix deadlock between quota writes and checkpoint
  f2fs: correct comment of f2fs_exist_written_data
  f2fs: compress: delay temp page allocation
  f2fs: compress: fix to update isize when overwriting compressed file
  f2fs: space related cleanup
  f2fs: fix use-after-free issue
  f2fs: Change the type of f2fs_flush_inline_data() to void
  f2fs: add F2FS_IOC_SEC_TRIM_FILE ioctl
  f2fs: segment.h: delete a duplicated word
  f2fs: compress: fix to avoid memory leak on cc->cpages
  f2fs: use generic names for generic ioctls
  f2fs: don't keep meta inode pages used for compressed block migration
  f2fs: fix error path in do_recover_data()
  f2fs: fix to wait GCed compressed page writeback
  f2fs: remove write attribute of main_blkaddr sysfs node
  f2fs: add GC_URGENT_LOW mode in gc_urgent
  f2fs: avoid readahead race condition
  f2fs: fix return value of move_data_block()
  f2fs: add parameter op_flag in f2fs_submit_page_read()
  f2fs: split f2fs_allocate_new_segments()
  f2fs: lost matching-pair of trace in f2fs_truncate_inode_blocks
  f2fs: fix an oops in f2fs_is_compressed_page
  f2fs: make trace enter and end in pairs for unlink
  f2fs: fix to check page dirty status before writeback
  f2fs: remove the unused compr parameter
  f2fs: support to trace f2fs_fiemap()
  f2fs: support to trace f2fs_bmap()
  f2fs: fix wrong return value of f2fs_bmap_compress()
  f2fs: remove useless parameter of __insert_free_nid()
  f2fs: fix typo in comment of f2fs_do_add_link
  f2fs: fix to wait page writeback before update
  f2fs: show more debug info for per-temperature log
  f2fs: add f2fs_gc exception handle in f2fs_ioc_gc_range
  f2fs: clean up parameter of f2fs_allocate_data_block()
  f2fs: shrink node_write lock coverage
  f2fs: add prefix for exported symbols
  f2fs: use kfree() to free variables allocated by match_strdup()
  f2fs: get the right gc victim section when section has several segments
  f2fs: fix a race condition between f2fs_write_end_io and f2fs_del_fsync_node_entry
  f2fs: remove useless truncate in f2fs_collapse_range()
  f2fs: use kfree() instead of kvfree() to free superblock data
  f2fs: avoid checkpatch error
  f2fs: should avoid inode eviction in synchronous path
This commit is contained in:
azrim 2022-05-20 14:51:11 +09:00
commit e9e83ae080
No known key found for this signature in database
GPG Key ID: 497F8FB059B45D1C
72 changed files with 10048 additions and 4024 deletions

View File

@ -22,7 +22,8 @@ Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
Description: Controls the victim selection policy for garbage collection.
Setting gc_idle = 0(default) will disable this option. Setting
gc_idle = 1 will select the Cost Benefit approach & setting
gc_idle = 2 will select the greedy approach.
gc_idle = 2 will select the greedy approach & setting
gc_idle = 3 will select the age-threshold based approach.
What: /sys/fs/f2fs/<disk>/reclaim_segments
Date: October 2013
@ -37,18 +38,25 @@ Description: This parameter controls the number of prefree segments to be
What: /sys/fs/f2fs/<disk>/main_blkaddr
Date: November 2019
Contact: "Ramon Pantin" <pantin@google.com>
Description:
Shows first block address of MAIN area.
Description: Shows first block address of MAIN area.
What: /sys/fs/f2fs/<disk>/ipu_policy
Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Controls the in-place-update policy.
updates in f2fs. User can set:
0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR,
0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL,
0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC,
0x40: F2FS_IPU_NOCACHE.
==== =================
0x01 F2FS_IPU_FORCE
0x02 F2FS_IPU_SSR
0x04 F2FS_IPU_UTIL
0x08 F2FS_IPU_SSR_UTIL
0x10 F2FS_IPU_FSYNC
0x20 F2FS_IPU_ASYNC
0x40 F2FS_IPU_NOCACHE
0x80 F2FS_IPU_HONOR_OPU_WRITE
==== =================
Refer segment.h for details.
What: /sys/fs/f2fs/<disk>/min_ipu_util
@ -88,6 +96,33 @@ Description: Controls the issue rate of discard commands that consist of small
checkpoint is triggered, and issued during the checkpoint.
By default, it is disabled with 0.
What: /sys/fs/f2fs/<disk>/max_discard_request
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the number of discards a thread will issue at a time.
Higher number will allow the discard thread to finish its work
faster, at the cost of higher latency for incomming I/O.
What: /sys/fs/f2fs/<disk>/min_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait between
issuing discard requests when there are discards to be issued and
no I/O aware interruptions occur.
What: /sys/fs/f2fs/<disk>/mid_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait between
issuing discard requests when there are discards to be issued and
an I/O aware interruption occurs.
What: /sys/fs/f2fs/<disk>/max_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait when there are
no discard operations to be issued.
What: /sys/fs/f2fs/<disk>/discard_granularity
Date: July 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
@ -102,6 +137,11 @@ Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Set timeout to issue discard commands during umount.
Default: 5 secs
What: /sys/fs/f2fs/<disk>/pending_discard
Date: November 2021
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of pending discard commands in the queue.
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@ -192,7 +232,34 @@ Description: Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/feature
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows all enabled features in current device.
Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/
Shows all enabled features in current device.
Supported features:
encryption, blkzoned, extra_attr, projquota, inode_checksum,
flexible_inline_xattr, quota_ino, inode_crtime, lost_found,
verity, sb_checksum, casefold, readonly, compression, pin_file.
What: /sys/fs/f2fs/<disk>/feature_list/
Date: June 2021
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Expand /sys/fs/f2fs/<disk>/features to meet sysfs rule.
Supported on-disk features:
encryption, block_zoned (aka blkzoned), extra_attr,
project_quota (aka projquota), inode_checksum,
flexible_inline_xattr, quota_ino, inode_crtime, lost_found,
verity, sb_checksum, casefold, readonly, compression.
Note that, pin_file is moved into /sys/fs/f2fs/features/.
What: /sys/fs/f2fs/features/
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows all enabled kernel features.
Supported features:
encryption, block_zoned, extra_attr, project_quota,
inode_checksum, flexible_inline_xattr, quota_ino,
inode_crtime, lost_found, verity, sb_checksum,
casefold, readonly, compression, test_dummy_encryption_v2,
atomic_write, pin_file, encrypted_casefold.
What: /sys/fs/f2fs/<disk>/inject_rate
Date: May 2016
@ -227,9 +294,16 @@ Description: Shows current reserved blocks in system, it may be temporarily
What: /sys/fs/f2fs/<disk>/gc_urgent
Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Do background GC agressively when set. When gc_urgent = 1,
background thread starts to do GC by given gc_urgent_sleep_time
interval. It is set to 0 by default.
Description: Do background GC aggressively when set. Set to 0 by default.
gc urgent high(1): does GC forcibly in a period of given
gc_urgent_sleep_time and ignores I/O idling check. uses greedy
GC approach and turns SSR mode on.
gc urgent low(2): lowers the bar of checking I/O idling in
order to process outstanding discard commands and GC a
little bit aggressively. uses cost benefit GC approach.
gc urgent mid(3): does GC forcibly in a period of given
gc_urgent_sleep_time and executes a mid level of I/O idling check.
uses cost benefit GC approach.
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date: August 2017
@ -263,7 +337,7 @@ Date April 2019
Contact: "Daniel Rosenberg" <drosen@google.com>
Description: If checkpoint=disable, it displays the number of blocks that
are unusable.
If checkpoint=enable it displays the enumber of blocks that
If checkpoint=enable it displays the number of blocks that
would be unusable if checkpoint=disable were to be set.
What: /sys/fs/f2fs/<disk>/encoding
@ -347,3 +421,143 @@ Date: April 2020
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Give a way to change iostat_period time. 3secs by default.
The new iostat trace gives stats gap given the period.
What: /sys/fs/f2fs/<disk>/max_io_bytes
Date: December 2020
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: This gives a control to limit the bio size in f2fs.
Default is zero, which will follow underlying block layer limit,
whereas, if it has a certain bytes value, f2fs won't submit a
bio larger than that size.
What: /sys/fs/f2fs/<disk>/stat/sb_status
Date: December 2020
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Show status of f2fs superblock in real time.
====== ===================== =================================
value sb status macro description
0x1 SBI_IS_DIRTY dirty flag for checkpoint
0x2 SBI_IS_CLOSE specify unmounting
0x4 SBI_NEED_FSCK need fsck.f2fs to fix
0x8 SBI_POR_DOING recovery is doing or not
0x10 SBI_NEED_SB_WRITE need to recover superblock
0x20 SBI_NEED_CP need to checkpoint
0x40 SBI_IS_SHUTDOWN shutdown by ioctl
0x80 SBI_IS_RECOVERED recovered orphan/data
0x100 SBI_CP_DISABLED CP was disabled last mount
0x200 SBI_CP_DISABLED_QUICK CP was disabled quickly
0x400 SBI_QUOTA_NEED_FLUSH need to flush quota info in CP
0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP
0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted
0x2000 SBI_IS_RESIZEFS resizefs is in process
0x4000 SBI_IS_FREEZING freefs is in process
====== ===================== =================================
What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
Date: January 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Give a way to change checkpoint merge daemon's io priority.
Its default value is "be,3", which means "BE" I/O class and
I/O priority "3". We can select the class between "rt" and "be",
and set the I/O priority within valid range of it. "," delimiter
is necessary in between I/O class and priority number.
What: /sys/fs/f2fs/<disk>/ovp_segments
Date: March 2021
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of overprovision segments.
What: /sys/fs/f2fs/<disk>/compr_written_block
Date: March 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the block count written after compression since mount. Note
that when the compressed blocks are deleted, this count doesn't
decrease. If you write "0" here, you can initialize
compr_written_block and compr_saved_block to "0".
What: /sys/fs/f2fs/<disk>/compr_saved_block
Date: March 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the saved block count with compression since mount. Note
that when the compressed blocks are deleted, this count doesn't
decrease. If you write "0" here, you can initialize
compr_written_block and compr_saved_block to "0".
What: /sys/fs/f2fs/<disk>/compr_new_inode
Date: March 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the count of inode newly enabled for compression since mount.
Note that when the compression is disabled for the files, this count
doesn't decrease. If you write "0" here, you can initialize
compr_new_inode to "0".
What: /sys/fs/f2fs/<disk>/atgc_candidate_ratio
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls candidate ratio in order to limit total
number of potential victim in all candidates, the value should be in
range of [0, 100], by default it was initialized as 20(%).
What: /sys/fs/f2fs/<disk>/atgc_candidate_count
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls candidate count in order to limit total
number of potential victim in all candidates, by default it was
initialized as 10 (sections).
What: /sys/fs/f2fs/<disk>/atgc_age_weight
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls age weight to balance weight proportion
in between aging and valid blocks, the value should be in range of
[0, 100], by default it was initialized as 60(%).
What: /sys/fs/f2fs/<disk>/atgc_age_threshold
Date: May 2021
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls age threshold to bypass GCing young
candidates whose age is not beyond the threshold, by default it was
initialized as 604800 seconds (equals to 7 days).
What: /sys/fs/f2fs/<disk>/gc_reclaimed_segments
Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show how many segments have been reclaimed by GC during a specific
GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy,
3: GC idle AT, 4: GC urgent high, 5: GC urgent low 6: GC urgent mid)
You can re-initialize this value to "0".
What: /sys/fs/f2fs/<disk>/gc_segment_mode
Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can control for which gc mode the "gc_reclaimed_segments" node shows.
Refer to the description of the modes in "gc_reclaimed_segments".
What: /sys/fs/f2fs/<disk>/max_fragment_chunk
Date: August 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: With "mode=fragment:block" mount options, we can scatter block allocation.
f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.
What: /sys/fs/f2fs/<disk>/max_fragment_hole
Date: August 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: With "mode=fragment:block" mount options, we can scatter block allocation.
f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.
What: /sys/fs/f2fs/<disk>/gc_urgent_high_remaining
Date: December 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can set the trial count limit for GC urgent high mode with this value.
If GC thread gets to the limit, the mode will turn back to GC normal mode.
By default, the value is zero, which means there is no limit like before.
What: /sys/fs/f2fs/<disk>/max_roll_forward_node_blocks
Date: January 2022
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Controls max # of node block writes to be used for roll forward
recovery. This can limit the roll forward recovery time.

View File

@ -0,0 +1,956 @@
.. SPDX-License-Identifier: GPL-2.0
==========================================
WHAT IS Flash-Friendly File System (F2FS)?
==========================================
NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
been equipped on a variety systems ranging from mobile to server systems. Since
they are known to have different characteristics from the conventional rotating
disks, a file system, an upper layer to the storage device, should adapt to the
changes from the sketch in the design level.
F2FS is a file system exploiting NAND flash memory-based storage devices, which
is based on Log-structured File System (LFS). The design has been focused on
addressing the fundamental issues in LFS, which are snowball effect of wandering
tree and high cleaning overhead.
Since a NAND flash memory-based storage device shows different characteristic
according to its internal geometry or flash memory management scheme, namely FTL,
F2FS and its tools support various parameters not only for configuring on-disk
layout, but also for selecting allocation and cleaning algorithms.
The following git tree provides the file system formatting tool (mkfs.f2fs),
a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
For reporting bugs and sending patches, please use the following mailing list:
- linux-f2fs-devel@lists.sourceforge.net
Background and Design issues
============================
Log-structured File System (LFS)
--------------------------------
"A log-structured file system writes all modifications to disk sequentially in
a log-like structure, thereby speeding up both file writing and crash recovery.
The log is the only structure on disk; it contains indexing information so that
files can be read back from the log efficiently. In order to maintain large free
areas on disk for fast writing, we divide the log into segments and use a
segment cleaner to compress the live information from heavily fragmented
segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and
implementation of a log-structured file system", ACM Trans. Computer Systems
10, 1, 2652.
Wandering Tree Problem
----------------------
In LFS, when a file data is updated and written to the end of log, its direct
pointer block is updated due to the changed location. Then the indirect pointer
block is also updated due to the direct pointer block update. In this manner,
the upper index structures such as inode, inode map, and checkpoint block are
also updated recursively. This problem is called as wandering tree problem [1],
and in order to enhance the performance, it should eliminate or relax the update
propagation as much as possible.
[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/
Cleaning Overhead
-----------------
Since LFS is based on out-of-place writes, it produces so many obsolete blocks
scattered across the whole storage. In order to serve new empty log space, it
needs to reclaim these obsolete blocks seamlessly to users. This job is called
as a cleaning process.
The process consists of three operations as follows.
1. A victim segment is selected through referencing segment usage table.
2. It loads parent index structures of all the data in the victim identified by
segment summary blocks.
3. It checks the cross-reference between the data and its parent index structure.
4. It moves valid data selectively.
This cleaning job may cause unexpected long delays, so the most important goal
is to hide the latencies to users. And also definitely, it should reduce the
amount of valid data to be moved, and move them quickly as well.
Key Features
============
Flash Awareness
---------------
- Enlarge the random write area for better performance, but provide the high
spatial locality
- Align FS data structures to the operational units in FTL as best efforts
Wandering Tree Problem
----------------------
- Use a term, “node”, that represents inodes as well as various pointer blocks
- Introduce Node Address Table (NAT) containing the locations of all the “node”
blocks; this will cut off the update propagation.
Cleaning Overhead
-----------------
- Support a background cleaning process
- Support greedy and cost-benefit algorithms for victim selection policies
- Support multi-head logs for static/dynamic hot and cold data separation
- Introduce adaptive logging for efficient block allocation
Mount Options
=============
======================== ============================================================
background_gc=%s Turn on/off cleaning operations, namely garbage
collection, triggered in background when I/O subsystem is
idle. If background_gc=on, it will turn on the garbage
collection and if background_gc=off, garbage collection
will be turned off. If background_gc=sync, it will turn
on synchronous garbage collection running in background.
Default value for this option is on. So garbage
collection is on by default.
gc_merge When background_gc is on, this option can be enabled to
let background GC thread to handle foreground GC requests,
it can eliminate the sluggish issue caused by slow foreground
GC operation when GC is triggered from a process with limited
I/O and CPU resources.
nogc_merge Disable GC merge feature.
disable_roll_forward Disable the roll-forward recovery routine
norecovery Disable the roll-forward recovery routine, mounted read-
only (i.e., -o ro,disable_roll_forward)
discard/nodiscard Enable/disable real-time discard in f2fs, if discard is
enabled, f2fs will issue discard/TRIM commands when a
segment is cleaned.
no_heap Disable heap-style segment allocation which finds free
segments for data from the beginning of main area, while
for node from the end of main area.
nouser_xattr Disable Extended User Attributes. Note: xattr is enabled
by default if CONFIG_F2FS_FS_XATTR is selected.
noacl Disable POSIX Access Control List. Note: acl is enabled
by default if CONFIG_F2FS_FS_POSIX_ACL is selected.
active_logs=%u Support configuring the number of active logs. In the
current design, f2fs supports only 2, 4, and 6 logs.
Default number is 6.
disable_ext_identify Disable the extension list configured by mkfs, so f2fs
is not aware of cold files such as media files.
inline_xattr Enable the inline xattrs feature.
noinline_xattr Disable the inline xattrs feature.
inline_xattr_size=%u Support configuring inline xattr size, it depends on
flexible inline xattr feature.
inline_data Enable the inline data feature: Newly created small (<~3.4k)
files can be written into inode block.
inline_dentry Enable the inline dir feature: data in newly created
directory entries can be written into inode block. The
space of inode block which is used to store inline
dentries is limited to ~3.4k.
noinline_dentry Disable the inline dentry feature.
flush_merge Merge concurrent cache_flush commands as much as possible
to eliminate redundant command issues. If the underlying
device handles the cache_flush command relatively slowly,
recommend to enable this option.
nobarrier This option can be used if underlying storage guarantees
its cached data should be written to the novolatile area.
If this option is set, no cache_flush commands are issued
but f2fs still guarantees the write ordering of all the
data writes.
fastboot This option is used when a system wants to reduce mount
time as much as possible, even though normal performance
can be sacrificed.
extent_cache Enable an extent cache based on rb-tree, it can cache
as many as extent which map between contiguous logical
address and physical address per inode, resulting in
increasing the cache hit ratio. Set by default.
noextent_cache Disable an extent cache based on rb-tree explicitly, see
the above extent_cache mount option.
noinline_data Disable the inline data feature, inline data feature is
enabled by default.
data_flush Enable data flushing before checkpoint in order to
persist data of regular and symlink.
reserve_root=%d Support configuring reserved space which is used for
allocation from a privileged user with specified uid or
gid, unit: 4KB, the default limit is 0.2% of user blocks.
resuid=%d The user ID which may use the reserved blocks.
resgid=%d The group ID which may use the reserved blocks.
fault_injection=%d Enable fault injection in all supported types with
specified injection rate.
fault_type=%d Support configuring fault injection type, should be
enabled with fault_injection option, fault type value
is shown below, it supports single or combined type.
=================== ===========
Type_Name Type_Value
=================== ===========
FAULT_KMALLOC 0x000000001
FAULT_KVMALLOC 0x000000002
FAULT_PAGE_ALLOC 0x000000004
FAULT_PAGE_GET 0x000000008
FAULT_ALLOC_BIO 0x000000010 (obsolete)
FAULT_ALLOC_NID 0x000000020
FAULT_ORPHAN 0x000000040
FAULT_BLOCK 0x000000080
FAULT_DIR_DEPTH 0x000000100
FAULT_EVICT_INODE 0x000000200
FAULT_TRUNCATE 0x000000400
FAULT_READ_IO 0x000000800
FAULT_CHECKPOINT 0x000001000
FAULT_DISCARD 0x000002000
FAULT_WRITE_IO 0x000004000
FAULT_SLAB_ALLOC 0x000008000
FAULT_DQUOT_INIT 0x000010000
FAULT_LOCK_OP 0x000020000
=================== ===========
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
writes towards main area.
"fragment:segment" and "fragment:block" are newly added here.
These are developer options for experiments to simulate filesystem
fragmentation/after-GC situation itself. The developers use these
modes to understand filesystem fragmentation/after-GC condition well,
and eventually get some insights to handle them better.
In "fragment:segment", f2fs allocates a new segment in ramdom
position. With this, we can simulate the after-GC condition.
In "fragment:block", we can scatter block allocation with
"max_fragment_chunk" and "max_fragment_hole" sysfs nodes.
We added some randomness to both chunk and hole size to make
it close to realistic IO pattern. So, in this mode, f2fs will allocate
1..<max_fragment_chunk> blocks in a chunk and make a hole in the
length of 1..<max_fragment_hole> by turns. With this, the newly
allocated blocks will be scattered throughout the whole partition.
Note that "fragment:block" implicitly enables "fragment:segment"
option for more randomness.
Please, use these options for your experiments and we strongly
recommend to re-format the filesystem after using these options.
io_bits=%u Set the bit size of write IO requests. It should be set
with "mode=lfs".
usrquota Enable plain user disk quota accounting.
grpquota Enable plain group disk quota accounting.
prjquota Enable plain project quota accounting.
usrjquota=<file> Appoint specified file and type during mount, so that quota
grpjquota=<file> information can be properly updated during recovery flow,
prjjquota=<file> <quota file>: must be in root directory;
jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
offusrjquota Turn off user journalled quota.
offgrpjquota Turn off group journalled quota.
offprjjquota Turn off project journalled quota.
quota Enable plain user disk quota accounting.
noquota Disable all plain disk quota option.
whint_mode=%s Control which write hints are passed down to block
layer. This supports "off", "user-based", and
"fs-based". In "off" mode (default), f2fs does not pass
down hints. In "user-based" mode, f2fs tries to pass
down hints given by users. And in "fs-based" mode, f2fs
passes down hints with its policy.
alloc_mode=%s Adjust block allocation policy, which supports "reuse"
and "default".
fsync_mode=%s Control the policy of fsync. Currently supports "posix",
"strict", and "nobarrier". In "posix" mode, which is
default, fsync will follow POSIX semantics and does a
light operation to improve the filesystem performance.
In "strict" mode, fsync will be heavy and behaves in line
with xfs, ext4 and btrfs, where xfstest generic/342 will
pass, but the performance will regress. "nobarrier" is
based on "posix", but doesn't issue flush command for
non-atomic files likewise "nobarrier" mount option.
test_dummy_encryption
test_dummy_encryption=%s
Enable dummy encryption, which provides a fake fscrypt
context. The fake fscrypt context is used by xfstests.
The argument may be either "v1" or "v2", in order to
select the corresponding fscrypt policy version.
checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable"
to reenable checkpointing. Is enabled by default. While
disabled, any unmounting or unexpected shutdowns will cause
the filesystem contents to appear as they did when the
filesystem was mounted with that option.
While mounting with checkpoint=disabled, the filesystem must
run garbage collection to ensure that all available space can
be used. If this takes too much time, the mount may return
EAGAIN. You may optionally add a value to indicate how much
of the disk you would be willing to temporarily give up to
avoid additional garbage collection. This can be given as a
number of blocks, or as a percent. For instance, mounting
with checkpoint=disable:100% would always succeed, but it may
hide up to all remaining free space. The actual space that
would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
This space is reclaimed once checkpoint=enable.
checkpoint_merge When checkpoint is enabled, this can be used to create a kernel
daemon and make it to merge concurrent checkpoint requests as
much as possible to eliminate redundant checkpoint issues. Plus,
we can eliminate the sluggish issue caused by slow checkpoint
operation when the checkpoint is done in a process context in
a cgroup having low i/o budget and cpu shares. To make this
do better, we set the default i/o priority of the kernel daemon
to "3", to give one higher priority than other kernel threads.
This is the same way to give a I/O priority to the jbd2
journaling thread of ext4 filesystem.
nocheckpoint_merge Disable checkpoint merge feature.
compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
"lz4", "zstd" and "lzo-rle" algorithm.
compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only
"lz4" and "zstd" support compress level config.
algorithm level range
lz4 3 - 16
zstd 1 - 22
compress_log_size=%u Support configuring compress cluster size, the size will
be 4KB * (1 << %u), 16KB is minimum size, also it's
default size.
compress_extension=%s Support adding specified extension, so that f2fs can enable
compression on those corresponding files, e.g. if all files
with '.ext' has high compression rate, we can set the '.ext'
on compression extension list and enable compression on
these file by default rather than to enable it via ioctl.
For other files, we can still enable compression via ioctl.
Note that, there is one reserved special extension '*', it
can be set to enable compression for all files.
nocompress_extension=%s Support adding specified extension, so that f2fs can disable
compression on those corresponding files, just contrary to compression extension.
If you know exactly which files cannot be compressed, you can use this.
The same extension name can't appear in both compress and nocompress
extension at the same time.
If the compress extension specifies all files, the types specified by the
nocompress extension will be treated as special cases and will not be compressed.
Don't allow use '*' to specifie all file in nocompress extension.
After add nocompress_extension, the priority should be:
dir_flag < comp_extention,nocompress_extension < comp_file_flag,no_comp_file_flag.
See more in compression sections.
compress_chksum Support verifying chksum of raw data in compressed cluster.
compress_mode=%s Control file compression mode. This supports "fs" and "user"
modes. In "fs" mode (default), f2fs does automatic compression
on the compression enabled files. In "user" mode, f2fs disables
the automaic compression and gives the user discretion of
choosing the target file and the timing. The user can do manual
compression/decompression on the compression enabled files using
ioctls.
compress_cache Support to use address space of a filesystem managed inode to
cache compressed block, in order to improve cache hit ratio of
random read.
inlinecrypt When possible, encrypt/decrypt the contents of encrypted
files using the blk-crypto framework rather than
filesystem-layer encryption. This allows the use of
inline encryption hardware. The on-disk format is
unaffected. For more details, see
Documentation/block/inline-encryption.rst.
atgc Enable age-threshold garbage collection, it provides high
effectiveness and efficiency on background GC.
discard_unit=%s Control discard unit, the argument can be "block", "segment"
and "section", issued discard command's offset/size will be
aligned to the unit, by default, "discard_unit=block" is set,
so that small discard functionality is enabled.
For blkzoned device, "discard_unit=section" will be set by
default, it is helpful for large sized SMR or ZNS devices to
reduce memory cost by getting rid of fs metadata supports small
discard.
======================== ============================================================
Debugfs Entries
===============
/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as
f2fs. Each file shows the whole f2fs information.
/sys/kernel/debug/f2fs/status includes:
- major file system information managed by f2fs currently
- average SIT information about whole segments
- current memory footprint consumed by f2fs.
Sysfs Entries
=============
Information about mounted f2fs file systems can be found in
/sys/fs/f2fs. Each mounted filesystem will have a directory in
/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda).
The files in each per-device directory are shown in table below.
Files in /sys/fs/f2fs/<devname>
(see also Documentation/ABI/testing/sysfs-fs-f2fs)
Usage
=====
1. Download userland tools and compile them.
2. Skip, if f2fs was compiled statically inside kernel.
Otherwise, insert the f2fs.ko module::
# insmod f2fs.ko
3. Create a directory to use when mounting::
# mkdir /mnt/f2fs
4. Format the block device, and then mount as f2fs::
# mkfs.f2fs -l label /dev/block_device
# mount -t f2fs /dev/block_device /mnt/f2fs
mkfs.f2fs
---------
The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem,
which builds a basic on-disk layout.
The quick options consist of:
=============== ===========================================================
``-l [label]`` Give a volume label, up to 512 unicode name.
``-a [0 or 1]`` Split start location of each area for heap-based allocation.
1 is set by default, which performs this.
``-o [int]`` Set overprovision ratio in percent over volume size.
5 is set by default.
``-s [int]`` Set the number of segments per section.
1 is set by default.
``-z [int]`` Set the number of sections per zone.
1 is set by default.
``-e [str]`` Set basic extension list. e.g. "mp3,gif,mov"
``-t [0 or 1]`` Disable discard command or not.
1 is set by default, which conducts discard.
=============== ===========================================================
Note: please refer to the manpage of mkfs.f2fs(8) to get full option list.
fsck.f2fs
---------
The fsck.f2fs is a tool to check the consistency of an f2fs-formatted
partition, which examines whether the filesystem metadata and user-made data
are cross-referenced correctly or not.
Note that, initial version of the tool does not fix any inconsistency.
The quick options consist of::
-d debug level [default:0]
Note: please refer to the manpage of fsck.f2fs(8) to get full option list.
dump.f2fs
---------
The dump.f2fs shows the information of specific inode and dumps SSA and SIT to
file. Each file is dump_ssa and dump_sit.
The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
It shows on-disk inode information recognized by a given inode number, and is
able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
./dump_sit respectively.
The options consist of::
-d debug level [default:0]
-i inode no (hex)
-s [SIT dump segno from #1~#2 (decimal), for all 0~-1]
-a [SSA dump segno from #1~#2 (decimal), for all 0~-1]
Examples::
# dump.f2fs -i [ino] /dev/sdx
# dump.f2fs -s 0~-1 /dev/sdx (SIT dump)
# dump.f2fs -a 0~-1 /dev/sdx (SSA dump)
Note: please refer to the manpage of dump.f2fs(8) to get full option list.
sload.f2fs
----------
The sload.f2fs gives a way to insert files and directories in the exisiting disk
image. This tool is useful when building f2fs images given compiled files.
Note: please refer to the manpage of sload.f2fs(8) to get full option list.
resize.f2fs
-----------
The resize.f2fs lets a user resize the f2fs-formatted disk image, while preserving
all the files and directories stored in the image.
Note: please refer to the manpage of resize.f2fs(8) to get full option list.
defrag.f2fs
-----------
The defrag.f2fs can be used to defragment scattered written data as well as
filesystem metadata across the disk. This can improve the write speed by giving
more free consecutive space.
Note: please refer to the manpage of defrag.f2fs(8) to get full option list.
f2fs_io
-------
The f2fs_io is a simple tool to issue various filesystem APIs as well as
f2fs-specific ones, which is very useful for QA tests.
Note: please refer to the manpage of f2fs_io(8) to get full option list.
Design
======
On-disk Layout
--------------
F2FS divides the whole volume into a number of segments, each of which is fixed
to 2MB in size. A section is composed of consecutive segments, and a zone
consists of a set of sections. By default, section and zone sizes are set to one
segment size identically, but users can easily modify the sizes by mkfs.
F2FS splits the entire volume into six areas, and all the areas except superblock
consist of multiple segments as described below::
align with the zone size <-|
|-> align with the segment size
_________________________________________________________________________
| | | Segment | Node | Segment | |
| Superblock | Checkpoint | Info. | Address | Summary | Main |
| (SB) | (CP) | Table (SIT) | Table (NAT) | Area (SSA) | |
|____________|_____2______|______N______|______N______|______N_____|__N___|
. .
. .
. .
._________________________________________.
|_Segment_|_..._|_Segment_|_..._|_Segment_|
. .
._________._________
|_section_|__...__|_
. .
.________.
|__zone__|
- Superblock (SB)
It is located at the beginning of the partition, and there exist two copies
to avoid file system crash. It contains basic partition information and some
default parameters of f2fs.
- Checkpoint (CP)
It contains file system information, bitmaps for valid NAT/SIT sets, orphan
inode lists, and summary entries of current active segments.
- Segment Information Table (SIT)
It contains segment information such as valid block count and bitmap for the
validity of all the blocks.
- Node Address Table (NAT)
It is composed of a block address table for all the node blocks stored in
Main area.
- Segment Summary Area (SSA)
It contains summary entries which contains the owner information of all the
data and node blocks stored in Main area.
- Main Area
It contains file and directory data including their indices.
In order to avoid misalignment between file system and flash-based storage, F2FS
aligns the start block address of CP with the segment size. Also, it aligns the
start block address of Main area with the zone size by reserving some segments
in SSA area.
Reference the following survey for additional technical details.
https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey
File System Metadata Structure
------------------------------
F2FS adopts the checkpointing scheme to maintain file system consistency. At
mount time, F2FS first tries to find the last valid checkpoint data by scanning
CP area. In order to reduce the scanning time, F2FS uses only two copies of CP.
One of them always indicates the last valid data, which is called as shadow copy
mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism.
For file system consistency, each CP points to which NAT and SIT copies are
valid, as shown as below::
+--------+----------+---------+
| CP | SIT | NAT |
+--------+----------+---------+
. . . .
. . . .
. . . .
+-------+-------+--------+--------+--------+--------+
| CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 |
+-------+-------+--------+--------+--------+--------+
| ^ ^
| | |
`----------------------------------------'
Index Structure
---------------
The key data structure to manage the data locations is a "node". Similar to
traditional file structures, F2FS has three types of node: inode, direct node,
indirect node. F2FS assigns 4KB to an inode block which contains 923 data block
indices, two direct node pointers, two indirect node pointers, and one double
indirect node pointer as described below. One direct node block contains 1018
data blocks, and one indirect node block contains also 1018 node blocks. Thus,
one inode block (i.e., a file) covers::
4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB.
Inode block (4KB)
|- data (923)
|- direct node (2)
| `- data (1018)
|- indirect node (2)
| `- direct node (1018)
| `- data (1018)
`- double indirect node (1)
`- indirect node (1018)
`- direct node (1018)
`- data (1018)
Note that all the node blocks are mapped by NAT which means the location of
each node is translated by the NAT table. In the consideration of the wandering
tree problem, F2FS is able to cut off the propagation of node updates caused by
leaf data writes.
Directory Structure
-------------------
A directory entry occupies 11 bytes, which consists of the following attributes.
- hash hash value of the file name
- ino inode number
- len the length of file name
- type file type such as directory, symlink, etc
A dentry block consists of 214 dentry slots and file names. Therein a bitmap is
used to represent whether each dentry is valid or not. A dentry block occupies
4KB with the following composition.
::
Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) +
dentries(11 * 214 bytes) + file name (8 * 214 bytes)
[Bucket]
+--------------------------------+
|dentry block 1 | dentry block 2 |
+--------------------------------+
. .
. .
. [Dentry Block Structure: 4KB] .
+--------+----------+----------+------------+
| bitmap | reserved | dentries | file names |
+--------+----------+----------+------------+
[Dentry Block: 4KB] . .
. .
. .
+------+------+-----+------+
| hash | ino | len | type |
+------+------+-----+------+
[Dentry Structure: 11 bytes]
F2FS implements multi-level hash tables for directory structure. Each level has
a hash table with dedicated number of hash buckets as shown below. Note that
"A(2B)" means a bucket includes 2 data blocks.
::
----------------------
A : bucket
B : block
N : MAX_DIR_HASH_DEPTH
----------------------
level #0 | A(2B)
|
level #1 | A(2B) - A(2B)
|
level #2 | A(2B) - A(2B) - A(2B) - A(2B)
. | . . . .
level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B)
. | . . . .
level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B)
The number of blocks and buckets are determined by::
,- 2, if n < MAX_DIR_HASH_DEPTH / 2,
# of blocks in level #n = |
`- 4, Otherwise
,- 2^(n + dir_level),
| if n + dir_level < MAX_DIR_HASH_DEPTH / 2,
# of buckets in level #n = |
`- 2^((MAX_DIR_HASH_DEPTH / 2) - 1),
Otherwise
When F2FS finds a file name in a directory, at first a hash value of the file
name is calculated. Then, F2FS scans the hash table in level #0 to find the
dentry consisting of the file name and its inode number. If not found, F2FS
scans the next hash table in level #1. In this way, F2FS scans hash tables in
each levels incrementally from 1 to N. In each level F2FS needs to scan only
one bucket determined by the following equation, which shows O(log(# of files))
complexity::
bucket number to scan in level #n = (hash value) % (# of buckets in level #n)
In the case of file creation, F2FS finds empty consecutive slots that cover the
file name. F2FS searches the empty slots in the hash tables of whole levels from
1 to N in the same way as the lookup operation.
The following figure shows an example of two cases holding children::
--------------> Dir <--------------
| |
child child
child - child [hole] - child
child - child - child [hole] - [hole] - child
Case 1: Case 2:
Number of children = 6, Number of children = 3,
File size = 7 File size = 7
Default Block Allocation
------------------------
At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node
and Hot/Warm/Cold data.
- Hot node contains direct node blocks of directories.
- Warm node contains direct node blocks except hot node blocks.
- Cold node contains indirect node blocks
- Hot data contains dentry blocks
- Warm data contains data blocks except hot and cold data blocks
- Cold data contains multimedia data or migrated data blocks
LFS has two schemes for free space management: threaded log and copy-and-compac-
tion. The copy-and-compaction scheme which is known as cleaning, is well-suited
for devices showing very good sequential write performance, since free segments
are served all the time for writing new data. However, it suffers from cleaning
overhead under high utilization. Contrarily, the threaded log scheme suffers
from random writes, but no cleaning process is needed. F2FS adopts a hybrid
scheme where the copy-and-compaction scheme is adopted by default, but the
policy is dynamically changed to the threaded log scheme according to the file
system status.
In order to align F2FS with underlying flash-based storage, F2FS allocates a
segment in a unit of section. F2FS expects that the section size would be the
same as the unit size of garbage collection in FTL. Furthermore, with respect
to the mapping granularity in FTL, F2FS allocates each section of the active
logs from different zones as much as possible, since FTL can write the data in
the active logs into one allocation unit according to its mapping granularity.
Cleaning process
----------------
F2FS does cleaning both on demand and in the background. On-demand cleaning is
triggered when there are not enough free segments to serve VFS calls. Background
cleaner is operated by a kernel thread, and triggers the cleaning job when the
system is idle.
F2FS supports two victim selection policies: greedy and cost-benefit algorithms.
In the greedy algorithm, F2FS selects a victim segment having the smallest number
of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment
according to the segment age and the number of valid blocks in order to address
log block thrashing problem in the greedy algorithm. F2FS adopts the greedy
algorithm for on-demand cleaner, while background cleaner adopts cost-benefit
algorithm.
In order to identify whether the data in the victim segment are valid or not,
F2FS manages a bitmap. Each bit represents the validity of a block, and the
bitmap is composed of a bit stream covering whole blocks in main area.
Write-hint Policy
-----------------
1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
2) whint_mode=user-based. F2FS tries to pass down hints given by
users.
===================== ======================== ===================
User F2FS Block
===================== ======================== ===================
N/A META WRITE_LIFE_NOT_SET
N/A HOT_NODE "
N/A WARM_NODE "
N/A COLD_NODE "
ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
extension list " "
-- buffered io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " "
WRITE_LIFE_MEDIUM " "
WRITE_LIFE_LONG " "
-- direct io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG " WRITE_LIFE_LONG
===================== ======================== ===================
3) whint_mode=fs-based. F2FS passes down hints with its policy.
===================== ======================== ===================
User F2FS Block
===================== ======================== ===================
N/A META WRITE_LIFE_MEDIUM;
N/A HOT_NODE WRITE_LIFE_NOT_SET
N/A WARM_NODE "
N/A COLD_NODE WRITE_LIFE_NONE
ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
extension list " "
-- buffered io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
WRITE_LIFE_NONE " "
WRITE_LIFE_MEDIUM " "
WRITE_LIFE_LONG " "
-- direct io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG " WRITE_LIFE_LONG
===================== ======================== ===================
Fallocate(2) Policy
-------------------
The default policy follows the below POSIX rule.
Allocating disk space
The default operation (i.e., mode is zero) of fallocate() allocates
the disk space within the range specified by offset and len. The
file size (as reported by stat(2)) will be changed if offset+len is
greater than the file size. Any subregion within the range specified
by offset and len that did not contain data before the call will be
initialized to zero. This default behavior closely resembles the
behavior of the posix_fallocate(3) library function, and is intended
as a method of optimally implementing that function.
However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to
fallocate(fd, DEFAULT_MODE), it allocates on-disk block addressess having
zero or random data, which is useful to the below scenario where:
1. create(fd)
2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
3. fallocate(fd, 0, 0, size)
4. address = fibmap(fd, offset)
5. open(blkdev)
6. write(blkdev, address)
Compression implementation
--------------------------
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special block address is used to indicate
a cluster is a compressed one or normal one; for compressed cluster, following
metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs
stores data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in cluster contain valid data and compress ratio of
cluster data is lower than specified threshold.
- To enable compression on regular inode, there are four ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
* mount w/ -o compress_extension=*; touch any_file
- To disable compression on regular inode, there are two ways:
* chattr -c file
* mount w/ -o nocompress_extension=ext; touch file.ext
- Priority in between FS_COMPR_FL, FS_NOCOMP_FS, extensions:
* compress_extension=so; nocompress_extension=zip; chattr +c dir; touch
dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so and baz.txt
should be compresse, bar.zip should be non-compressed. chattr +c dir/bar.zip
can enable compress on bar.zip.
* compress_extension=so; nocompress_extension=zip; chattr -c dir; touch
dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so should be
compresse, bar.zip and baz.txt should be non-compressed.
chattr+c dir/bar.zip; chattr+c dir/baz.txt; can enable compress on bar.zip
and baz.txt.
- At this point, compression feature doesn't expose compressed space to user
directly in order to guarantee potential data updates later to the space.
Instead, the main goal is to reduce data writes to flash disk as much as
possible, resulting in extending disk life time as well as relaxing IO
congestion. Alternatively, we've added ioctl(F2FS_IOC_RELEASE_COMPRESS_BLOCKS)
interface to reclaim compressed space and show it to user after putting the
immutable bit. Immutable bit, after release, it doesn't allow writing/mmaping
on the file, until reserving compressed space via
ioctl(F2FS_IOC_RESERVE_COMPRESS_BLOCKS) or truncating filesize to zero.
Compress metadata layout::
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Compression mode
--------------------------
f2fs supports "fs" and "user" compression modes with "compression_mode" mount option.
With this option, f2fs provides a choice to select the way how to compress the
compression enabled files (refer to "Compression implementation" section for how to
enable compression on a regular inode).
1) compress_mode=fs
This is the default option. f2fs does automatic compression in the writeback of the
compression enabled files.
2) compress_mode=user
This disables the automatic compression and gives the user discretion of choosing the
target file and the timing. The user can do manual compression/decompression on the
compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE
ioctls like the below.
To decompress a file,
fd = open(filename, O_WRONLY, 0);
ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE);
To compress a file,
fd = open(filename, O_WRONLY, 0);
ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE);
NVMe Zoned Namespace devices
----------------------------
- ZNS defines a per-zone capacity which can be equal or less than the
zone-size. Zone-capacity is the number of usable blocks in the zone.
F2FS checks if zone-capacity is less than zone-size, if it is, then any
segment which starts after the zone-capacity is marked as not-free in
the free segment bitmap at initial mount time. These segments are marked
as permanently used so they are not allocated for writes and
consequently are not needed to be garbage collected. In case the
zone-capacity is not aligned to default segment size(2MB), then a segment
can start before the zone-capacity and span across zone-capacity boundary.
Such spanning segments are also considered as usable segments. All blocks
past the zone-capacity are considered unusable in these segments.

View File

@ -1,674 +0,0 @@
================================================================================
WHAT IS Flash-Friendly File System (F2FS)?
================================================================================
NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
been equipped on a variety systems ranging from mobile to server systems. Since
they are known to have different characteristics from the conventional rotating
disks, a file system, an upper layer to the storage device, should adapt to the
changes from the sketch in the design level.
F2FS is a file system exploiting NAND flash memory-based storage devices, which
is based on Log-structured File System (LFS). The design has been focused on
addressing the fundamental issues in LFS, which are snowball effect of wandering
tree and high cleaning overhead.
Since a NAND flash memory-based storage device shows different characteristic
according to its internal geometry or flash memory management scheme, namely FTL,
F2FS and its tools support various parameters not only for configuring on-disk
layout, but also for selecting allocation and cleaning algorithms.
The following git tree provides the file system formatting tool (mkfs.f2fs),
a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
>> git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
For reporting bugs and sending patches, please use the following mailing list:
>> linux-f2fs-devel@lists.sourceforge.net
================================================================================
BACKGROUND AND DESIGN ISSUES
================================================================================
Log-structured File System (LFS)
--------------------------------
"A log-structured file system writes all modifications to disk sequentially in
a log-like structure, thereby speeding up both file writing and crash recovery.
The log is the only structure on disk; it contains indexing information so that
files can be read back from the log efficiently. In order to maintain large free
areas on disk for fast writing, we divide the log into segments and use a
segment cleaner to compress the live information from heavily fragmented
segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and
implementation of a log-structured file system", ACM Trans. Computer Systems
10, 1, 2652.
Wandering Tree Problem
----------------------
In LFS, when a file data is updated and written to the end of log, its direct
pointer block is updated due to the changed location. Then the indirect pointer
block is also updated due to the direct pointer block update. In this manner,
the upper index structures such as inode, inode map, and checkpoint block are
also updated recursively. This problem is called as wandering tree problem [1],
and in order to enhance the performance, it should eliminate or relax the update
propagation as much as possible.
[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/
Cleaning Overhead
-----------------
Since LFS is based on out-of-place writes, it produces so many obsolete blocks
scattered across the whole storage. In order to serve new empty log space, it
needs to reclaim these obsolete blocks seamlessly to users. This job is called
as a cleaning process.
The process consists of three operations as follows.
1. A victim segment is selected through referencing segment usage table.
2. It loads parent index structures of all the data in the victim identified by
segment summary blocks.
3. It checks the cross-reference between the data and its parent index structure.
4. It moves valid data selectively.
This cleaning job may cause unexpected long delays, so the most important goal
is to hide the latencies to users. And also definitely, it should reduce the
amount of valid data to be moved, and move them quickly as well.
================================================================================
KEY FEATURES
================================================================================
Flash Awareness
---------------
- Enlarge the random write area for better performance, but provide the high
spatial locality
- Align FS data structures to the operational units in FTL as best efforts
Wandering Tree Problem
----------------------
- Use a term, “node”, that represents inodes as well as various pointer blocks
- Introduce Node Address Table (NAT) containing the locations of all the “node”
blocks; this will cut off the update propagation.
Cleaning Overhead
-----------------
- Support a background cleaning process
- Support greedy and cost-benefit algorithms for victim selection policies
- Support multi-head logs for static/dynamic hot and cold data separation
- Introduce adaptive logging for efficient block allocation
================================================================================
MOUNT OPTIONS
================================================================================
background_gc=%s Turn on/off cleaning operations, namely garbage
collection, triggered in background when I/O subsystem is
idle. If background_gc=on, it will turn on the garbage
collection and if background_gc=off, garbage collection
will be turned off. If background_gc=sync, it will turn
on synchronous garbage collection running in background.
Default value for this option is on. So garbage
collection is on by default.
disable_roll_forward Disable the roll-forward recovery routine
norecovery Disable the roll-forward recovery routine, mounted read-
only (i.e., -o ro,disable_roll_forward)
discard/nodiscard Enable/disable real-time discard in f2fs, if discard is
enabled, f2fs will issue discard/TRIM commands when a
segment is cleaned.
no_heap Disable heap-style segment allocation which finds free
segments for data from the beginning of main area, while
for node from the end of main area.
nouser_xattr Disable Extended User Attributes. Note: xattr is enabled
by default if CONFIG_F2FS_FS_XATTR is selected.
noacl Disable POSIX Access Control List. Note: acl is enabled
by default if CONFIG_F2FS_FS_POSIX_ACL is selected.
active_logs=%u Support configuring the number of active logs. In the
current design, f2fs supports only 2, 4, and 6 logs.
Default number is 6.
disable_ext_identify Disable the extension list configured by mkfs, so f2fs
does not aware of cold files such as media files.
inline_xattr Enable the inline xattrs feature.
noinline_xattr Disable the inline xattrs feature.
inline_xattr_size=%u Support configuring inline xattr size, it depends on
flexible inline xattr feature.
inline_data Enable the inline data feature: New created small(<~3.4k)
files can be written into inode block.
inline_dentry Enable the inline dir feature: data in new created
directory entries can be written into inode block. The
space of inode block which is used to store inline
dentries is limited to ~3.4k.
noinline_dentry Disable the inline dentry feature.
flush_merge Merge concurrent cache_flush commands as much as possible
to eliminate redundant command issues. If the underlying
device handles the cache_flush command relatively slowly,
recommend to enable this option.
nobarrier This option can be used if underlying storage guarantees
its cached data should be written to the novolatile area.
If this option is set, no cache_flush commands are issued
but f2fs still guarantees the write ordering of all the
data writes.
fastboot This option is used when a system wants to reduce mount
time as much as possible, even though normal performance
can be sacrificed.
extent_cache Enable an extent cache based on rb-tree, it can cache
as many as extent which map between contiguous logical
address and physical address per inode, resulting in
increasing the cache hit ratio. Set by default.
noextent_cache Disable an extent cache based on rb-tree explicitly, see
the above extent_cache mount option.
noinline_data Disable the inline data feature, inline data feature is
enabled by default.
data_flush Enable data flushing before checkpoint in order to
persist data of regular and symlink.
reserve_root=%d Support configuring reserved space which is used for
allocation from a privileged user with specified uid or
gid, unit: 4KB, the default limit is 0.2% of user blocks.
resuid=%d The user ID which may use the reserved blocks.
resgid=%d The group ID which may use the reserved blocks.
fault_injection=%d Enable fault injection in all supported types with
specified injection rate.
fault_type=%d Support configuring fault injection type, should be
enabled with fault_injection option, fault type value
is shown below, it supports single or combined type.
Type_Name Type_Value
FAULT_KMALLOC 0x000000001
FAULT_KVMALLOC 0x000000002
FAULT_PAGE_ALLOC 0x000000004
FAULT_PAGE_GET 0x000000008
FAULT_ALLOC_BIO 0x000000010
FAULT_ALLOC_NID 0x000000020
FAULT_ORPHAN 0x000000040
FAULT_BLOCK 0x000000080
FAULT_DIR_DEPTH 0x000000100
FAULT_EVICT_INODE 0x000000200
FAULT_TRUNCATE 0x000000400
FAULT_READ_IO 0x000000800
FAULT_CHECKPOINT 0x000001000
FAULT_DISCARD 0x000002000
FAULT_WRITE_IO 0x000004000
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
writes towards main area.
io_bits=%u Set the bit size of write IO requests. It should be set
with "mode=lfs".
usrquota Enable plain user disk quota accounting.
grpquota Enable plain group disk quota accounting.
prjquota Enable plain project quota accounting.
usrjquota=<file> Appoint specified file and type during mount, so that quota
grpjquota=<file> information can be properly updated during recovery flow,
prjjquota=<file> <quota file>: must be in root directory;
jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
offusrjquota Turn off user journelled quota.
offgrpjquota Turn off group journelled quota.
offprjjquota Turn off project journelled quota.
quota Enable plain user disk quota accounting.
noquota Disable all plain disk quota option.
whint_mode=%s Control which write hints are passed down to block
layer. This supports "off", "user-based", and
"fs-based". In "off" mode (default), f2fs does not pass
down hints. In "user-based" mode, f2fs tries to pass
down hints given by users. And in "fs-based" mode, f2fs
passes down hints with its policy.
alloc_mode=%s Adjust block allocation policy, which supports "reuse"
and "default".
fsync_mode=%s Control the policy of fsync. Currently supports "posix",
"strict", and "nobarrier". In "posix" mode, which is
default, fsync will follow POSIX semantics and does a
light operation to improve the filesystem performance.
In "strict" mode, fsync will be heavy and behaves in line
with xfs, ext4 and btrfs, where xfstest generic/342 will
pass, but the performance will regress. "nobarrier" is
based on "posix", but doesn't issue flush command for
non-atomic files likewise "nobarrier" mount option.
test_dummy_encryption
test_dummy_encryption=%s
Enable dummy encryption, which provides a fake fscrypt
context. The fake fscrypt context is used by xfstests.
The argument may be either "v1" or "v2", in order to
select the corresponding fscrypt policy version.
checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable"
to reenable checkpointing. Is enabled by default. While
disabled, any unmounting or unexpected shutdowns will cause
the filesystem contents to appear as they did when the
filesystem was mounted with that option.
While mounting with checkpoint=disabled, the filesystem must
run garbage collection to ensure that all available space can
be used. If this takes too much time, the mount may return
EAGAIN. You may optionally add a value to indicate how much
of the disk you would be willing to temporarily give up to
avoid additional garbage collection. This can be given as a
number of blocks, or as a percent. For instance, mounting
with checkpoint=disable:100% would always succeed, but it may
hide up to all remaining free space. The actual space that
would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
This space is reclaimed once checkpoint=enable.
compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
"lz4" and "zstd" algorithm.
compress_log_size=%u Support configuring compress cluster size, the size will
be 4KB * (1 << %u), 16KB is minimum size, also it's
default size.
compress_extension=%s Support adding specified extension, so that f2fs can enable
compression on those corresponding files, e.g. if all files
with '.ext' has high compression rate, we can set the '.ext'
on compression extension list and enable compression on
these file by default rather than to enable it via ioctl.
For other files, we can still enable compression via ioctl.
================================================================================
DEBUGFS ENTRIES
================================================================================
/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as
f2fs. Each file shows the whole f2fs information.
/sys/kernel/debug/f2fs/status includes:
- major file system information managed by f2fs currently
- average SIT information about whole segments
- current memory footprint consumed by f2fs.
================================================================================
SYSFS ENTRIES
================================================================================
Information about mounted f2fs file systems can be found in
/sys/fs/f2fs. Each mounted filesystem will have a directory in
/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda).
The files in each per-device directory are shown in table below.
Files in /sys/fs/f2fs/<devname>
(see also Documentation/ABI/testing/sysfs-fs-f2fs)
================================================================================
USAGE
================================================================================
1. Download userland tools and compile them.
2. Skip, if f2fs was compiled statically inside kernel.
Otherwise, insert the f2fs.ko module.
# insmod f2fs.ko
3. Create a directory trying to mount
# mkdir /mnt/f2fs
4. Format the block device, and then mount as f2fs
# mkfs.f2fs -l label /dev/block_device
# mount -t f2fs /dev/block_device /mnt/f2fs
mkfs.f2fs
---------
The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem,
which builds a basic on-disk layout.
The options consist of:
-l [label] : Give a volume label, up to 512 unicode name.
-a [0 or 1] : Split start location of each area for heap-based allocation.
1 is set by default, which performs this.
-o [int] : Set overprovision ratio in percent over volume size.
5 is set by default.
-s [int] : Set the number of segments per section.
1 is set by default.
-z [int] : Set the number of sections per zone.
1 is set by default.
-e [str] : Set basic extension list. e.g. "mp3,gif,mov"
-t [0 or 1] : Disable discard command or not.
1 is set by default, which conducts discard.
fsck.f2fs
---------
The fsck.f2fs is a tool to check the consistency of an f2fs-formatted
partition, which examines whether the filesystem metadata and user-made data
are cross-referenced correctly or not.
Note that, initial version of the tool does not fix any inconsistency.
The options consist of:
-d debug level [default:0]
dump.f2fs
---------
The dump.f2fs shows the information of specific inode and dumps SSA and SIT to
file. Each file is dump_ssa and dump_sit.
The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
It shows on-disk inode information recognized by a given inode number, and is
able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
./dump_sit respectively.
The options consist of:
-d debug level [default:0]
-i inode no (hex)
-s [SIT dump segno from #1~#2 (decimal), for all 0~-1]
-a [SSA dump segno from #1~#2 (decimal), for all 0~-1]
Examples:
# dump.f2fs -i [ino] /dev/sdx
# dump.f2fs -s 0~-1 /dev/sdx (SIT dump)
# dump.f2fs -a 0~-1 /dev/sdx (SSA dump)
================================================================================
DESIGN
================================================================================
On-disk Layout
--------------
F2FS divides the whole volume into a number of segments, each of which is fixed
to 2MB in size. A section is composed of consecutive segments, and a zone
consists of a set of sections. By default, section and zone sizes are set to one
segment size identically, but users can easily modify the sizes by mkfs.
F2FS splits the entire volume into six areas, and all the areas except superblock
consists of multiple segments as described below.
align with the zone size <-|
|-> align with the segment size
_________________________________________________________________________
| | | Segment | Node | Segment | |
| Superblock | Checkpoint | Info. | Address | Summary | Main |
| (SB) | (CP) | Table (SIT) | Table (NAT) | Area (SSA) | |
|____________|_____2______|______N______|______N______|______N_____|__N___|
. .
. .
. .
._________________________________________.
|_Segment_|_..._|_Segment_|_..._|_Segment_|
. .
._________._________
|_section_|__...__|_
. .
.________.
|__zone__|
- Superblock (SB)
: It is located at the beginning of the partition, and there exist two copies
to avoid file system crash. It contains basic partition information and some
default parameters of f2fs.
- Checkpoint (CP)
: It contains file system information, bitmaps for valid NAT/SIT sets, orphan
inode lists, and summary entries of current active segments.
- Segment Information Table (SIT)
: It contains segment information such as valid block count and bitmap for the
validity of all the blocks.
- Node Address Table (NAT)
: It is composed of a block address table for all the node blocks stored in
Main area.
- Segment Summary Area (SSA)
: It contains summary entries which contains the owner information of all the
data and node blocks stored in Main area.
- Main Area
: It contains file and directory data including their indices.
In order to avoid misalignment between file system and flash-based storage, F2FS
aligns the start block address of CP with the segment size. Also, it aligns the
start block address of Main area with the zone size by reserving some segments
in SSA area.
Reference the following survey for additional technical details.
https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey
File System Metadata Structure
------------------------------
F2FS adopts the checkpointing scheme to maintain file system consistency. At
mount time, F2FS first tries to find the last valid checkpoint data by scanning
CP area. In order to reduce the scanning time, F2FS uses only two copies of CP.
One of them always indicates the last valid data, which is called as shadow copy
mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism.
For file system consistency, each CP points to which NAT and SIT copies are
valid, as shown as below.
+--------+----------+---------+
| CP | SIT | NAT |
+--------+----------+---------+
. . . .
. . . .
. . . .
+-------+-------+--------+--------+--------+--------+
| CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 |
+-------+-------+--------+--------+--------+--------+
| ^ ^
| | |
`----------------------------------------'
Index Structure
---------------
The key data structure to manage the data locations is a "node". Similar to
traditional file structures, F2FS has three types of node: inode, direct node,
indirect node. F2FS assigns 4KB to an inode block which contains 923 data block
indices, two direct node pointers, two indirect node pointers, and one double
indirect node pointer as described below. One direct node block contains 1018
data blocks, and one indirect node block contains also 1018 node blocks. Thus,
one inode block (i.e., a file) covers:
4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB.
Inode block (4KB)
|- data (923)
|- direct node (2)
| `- data (1018)
|- indirect node (2)
| `- direct node (1018)
| `- data (1018)
`- double indirect node (1)
`- indirect node (1018)
`- direct node (1018)
`- data (1018)
Note that, all the node blocks are mapped by NAT which means the location of
each node is translated by the NAT table. In the consideration of the wandering
tree problem, F2FS is able to cut off the propagation of node updates caused by
leaf data writes.
Directory Structure
-------------------
A directory entry occupies 11 bytes, which consists of the following attributes.
- hash hash value of the file name
- ino inode number
- len the length of file name
- type file type such as directory, symlink, etc
A dentry block consists of 214 dentry slots and file names. Therein a bitmap is
used to represent whether each dentry is valid or not. A dentry block occupies
4KB with the following composition.
Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) +
dentries(11 * 214 bytes) + file name (8 * 214 bytes)
[Bucket]
+--------------------------------+
|dentry block 1 | dentry block 2 |
+--------------------------------+
. .
. .
. [Dentry Block Structure: 4KB] .
+--------+----------+----------+------------+
| bitmap | reserved | dentries | file names |
+--------+----------+----------+------------+
[Dentry Block: 4KB] . .
. .
. .
+------+------+-----+------+
| hash | ino | len | type |
+------+------+-----+------+
[Dentry Structure: 11 bytes]
F2FS implements multi-level hash tables for directory structure. Each level has
a hash table with dedicated number of hash buckets as shown below. Note that
"A(2B)" means a bucket includes 2 data blocks.
----------------------
A : bucket
B : block
N : MAX_DIR_HASH_DEPTH
----------------------
level #0 | A(2B)
|
level #1 | A(2B) - A(2B)
|
level #2 | A(2B) - A(2B) - A(2B) - A(2B)
. | . . . .
level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B)
. | . . . .
level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B)
The number of blocks and buckets are determined by,
,- 2, if n < MAX_DIR_HASH_DEPTH / 2,
# of blocks in level #n = |
`- 4, Otherwise
,- 2^(n + dir_level),
| if n + dir_level < MAX_DIR_HASH_DEPTH / 2,
# of buckets in level #n = |
`- 2^((MAX_DIR_HASH_DEPTH / 2) - 1),
Otherwise
When F2FS finds a file name in a directory, at first a hash value of the file
name is calculated. Then, F2FS scans the hash table in level #0 to find the
dentry consisting of the file name and its inode number. If not found, F2FS
scans the next hash table in level #1. In this way, F2FS scans hash tables in
each levels incrementally from 1 to N. In each levels F2FS needs to scan only
one bucket determined by the following equation, which shows O(log(# of files))
complexity.
bucket number to scan in level #n = (hash value) % (# of buckets in level #n)
In the case of file creation, F2FS finds empty consecutive slots that cover the
file name. F2FS searches the empty slots in the hash tables of whole levels from
1 to N in the same way as the lookup operation.
The following figure shows an example of two cases holding children.
--------------> Dir <--------------
| |
child child
child - child [hole] - child
child - child - child [hole] - [hole] - child
Case 1: Case 2:
Number of children = 6, Number of children = 3,
File size = 7 File size = 7
Default Block Allocation
------------------------
At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node
and Hot/Warm/Cold data.
- Hot node contains direct node blocks of directories.
- Warm node contains direct node blocks except hot node blocks.
- Cold node contains indirect node blocks
- Hot data contains dentry blocks
- Warm data contains data blocks except hot and cold data blocks
- Cold data contains multimedia data or migrated data blocks
LFS has two schemes for free space management: threaded log and copy-and-compac-
tion. The copy-and-compaction scheme which is known as cleaning, is well-suited
for devices showing very good sequential write performance, since free segments
are served all the time for writing new data. However, it suffers from cleaning
overhead under high utilization. Contrarily, the threaded log scheme suffers
from random writes, but no cleaning process is needed. F2FS adopts a hybrid
scheme where the copy-and-compaction scheme is adopted by default, but the
policy is dynamically changed to the threaded log scheme according to the file
system status.
In order to align F2FS with underlying flash-based storage, F2FS allocates a
segment in a unit of section. F2FS expects that the section size would be the
same as the unit size of garbage collection in FTL. Furthermore, with respect
to the mapping granularity in FTL, F2FS allocates each section of the active
logs from different zones as much as possible, since FTL can write the data in
the active logs into one allocation unit according to its mapping granularity.
Cleaning process
----------------
F2FS does cleaning both on demand and in the background. On-demand cleaning is
triggered when there are not enough free segments to serve VFS calls. Background
cleaner is operated by a kernel thread, and triggers the cleaning job when the
system is idle.
F2FS supports two victim selection policies: greedy and cost-benefit algorithms.
In the greedy algorithm, F2FS selects a victim segment having the smallest number
of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment
according to the segment age and the number of valid blocks in order to address
log block thrashing problem in the greedy algorithm. F2FS adopts the greedy
algorithm for on-demand cleaner, while background cleaner adopts cost-benefit
algorithm.
In order to identify whether the data in the victim segment are valid or not,
F2FS manages a bitmap. Each bit represents the validity of a block, and the
bitmap is composed of a bit stream covering whole blocks in main area.
Fallocate(2) Policy
-------------------
The default policy follows the below posix rule.
Allocating disk space
The default operation (i.e., mode is zero) of fallocate() allocates
the disk space within the range specified by offset and len. The
file size (as reported by stat(2)) will be changed if offset+len is
greater than the file size. Any subregion within the range specified
by offset and len that did not contain data before the call will be
initialized to zero. This default behavior closely resembles the
behavior of the posix_fallocate(3) library function, and is intended
as a method of optimally implementing that function.
However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to
fallocate(fd, DEFAULT_MODE), it allocates on-disk blocks addressess having
zero or random data, which is useful to the below scenario where:
1. create(fd)
2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
3. fallocate(fd, 0, 0, size)
4. address = fibmap(fd, offset)
5. open(blkdev)
6. write(blkdev, address)
Compression implementation
--------------------------
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special block address is used to indicate
cluster is compressed one or normal one, for compressed cluster, following
metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs
stores data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+

View File

@ -27,9 +27,9 @@ automatically verified against the file's Merkle tree. Reads of any
corrupted data, including mmap reads, will fail.
Userspace can use another ioctl to retrieve the root hash (actually
the "file measurement", which is a hash that includes the root hash)
that fs-verity is enforcing for the file. This ioctl executes in
constant time, regardless of the file size.
the "fs-verity file digest", which is a hash that includes the Merkle
tree root hash) that fs-verity is enforcing for the file. This ioctl
executes in constant time, regardless of the file size.
fs-verity is essentially a way to hash a file in constant time,
subject to the caveat that reads which would violate the hash will
@ -177,9 +177,10 @@ FS_IOC_ENABLE_VERITY can fail with the following errors:
FS_IOC_MEASURE_VERITY
---------------------
The FS_IOC_MEASURE_VERITY ioctl retrieves the measurement of a verity
file. The file measurement is a digest that cryptographically
identifies the file contents that are being enforced on reads.
The FS_IOC_MEASURE_VERITY ioctl retrieves the digest of a verity file.
The fs-verity file digest is a cryptographic digest that identifies
the file contents that are being enforced on reads; it is computed via
a Merkle tree and is different from a traditional full-file digest.
This ioctl takes in a pointer to a variable-length structure::
@ -197,7 +198,7 @@ On success, 0 is returned and the kernel fills in the structure as
follows:
- ``digest_algorithm`` will be the hash algorithm used for the file
measurement. It will match ``fsverity_enable_arg::hash_algorithm``.
digest. It will match ``fsverity_enable_arg::hash_algorithm``.
- ``digest_size`` will be the size of the digest in bytes, e.g. 32
for SHA-256. (This can be redundant with ``digest_algorithm``.)
- ``digest`` will be the actual bytes of the digest.
@ -216,6 +217,82 @@ FS_IOC_MEASURE_VERITY can fail with the following errors:
- ``EOVERFLOW``: the digest is longer than the specified
``digest_size`` bytes. Try providing a larger buffer.
FS_IOC_READ_VERITY_METADATA
---------------------------
The FS_IOC_READ_VERITY_METADATA ioctl reads verity metadata from a
verity file. This ioctl is available since Linux v5.12.
This ioctl allows writing a server program that takes a verity file
and serves it to a client program, such that the client can do its own
fs-verity compatible verification of the file. This only makes sense
if the client doesn't trust the server and if the server needs to
provide the storage for the client.
This is a fairly specialized use case, and most fs-verity users won't
need this ioctl.
This ioctl takes in a pointer to the following structure::
#define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1
#define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2
#define FS_VERITY_METADATA_TYPE_SIGNATURE 3
struct fsverity_read_metadata_arg {
__u64 metadata_type;
__u64 offset;
__u64 length;
__u64 buf_ptr;
__u64 __reserved;
};
``metadata_type`` specifies the type of metadata to read:
- ``FS_VERITY_METADATA_TYPE_MERKLE_TREE`` reads the blocks of the
Merkle tree. The blocks are returned in order from the root level
to the leaf level. Within each level, the blocks are returned in
the same order that their hashes are themselves hashed.
See `Merkle tree`_ for more information.
- ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity
descriptor. See `fs-verity descriptor`_.
- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the signature which was
passed to FS_IOC_ENABLE_VERITY, if any. See `Built-in signature
verification`_.
The semantics are similar to those of ``pread()``. ``offset``
specifies the offset in bytes into the metadata item to read from, and
``length`` specifies the maximum number of bytes to read from the
metadata item. ``buf_ptr`` is the pointer to the buffer to read into,
cast to a 64-bit integer. ``__reserved`` must be 0. On success, the
number of bytes read is returned. 0 is returned at the end of the
metadata item. The returned length may be less than ``length``, for
example if the ioctl is interrupted.
The metadata returned by FS_IOC_READ_VERITY_METADATA isn't guaranteed
to be authenticated against the file digest that would be returned by
`FS_IOC_MEASURE_VERITY`_, as the metadata is expected to be used to
implement fs-verity compatible verification anyway (though absent a
malicious disk, the metadata will indeed match). E.g. to implement
this ioctl, the filesystem is allowed to just read the Merkle tree
blocks from disk without actually verifying the path to the root node.
FS_IOC_READ_VERITY_METADATA can fail with the following errors:
- ``EFAULT``: the caller provided inaccessible memory
- ``EINTR``: the ioctl was interrupted before any data was read
- ``EINVAL``: reserved fields were set, or ``offset + length``
overflowed
- ``ENODATA``: the file is not a verity file, or
FS_VERITY_METADATA_TYPE_SIGNATURE was requested but the file doesn't
have a built-in signature
- ``ENOTTY``: this type of filesystem does not implement fs-verity, or
this ioctl is not yet implemented on it
- ``EOPNOTSUPP``: the kernel was not configured with fs-verity
support, or the filesystem superblock has not had the 'verity'
feature enabled on it. (See `Filesystem support`_.)
FS_IOC_GETFLAGS
---------------
@ -257,25 +334,24 @@ non-verity one, with the following exceptions:
with EIO (for read()) or SIGBUS (for mmap() reads).
- If the sysctl "fs.verity.require_signatures" is set to 1 and the
file's verity measurement is not signed by a key in the fs-verity
keyring, then opening the file will fail. See `Built-in signature
verification`_.
file is not signed by a key in the fs-verity keyring, then opening
the file will fail. See `Built-in signature verification`_.
Direct access to the Merkle tree is not supported. Therefore, if a
verity file is copied, or is backed up and restored, then it will lose
its "verity"-ness. fs-verity is primarily meant for files like
executables that are managed by a package manager.
File measurement computation
============================
File digest computation
=======================
This section describes how fs-verity hashes the file contents using a
Merkle tree to produce the "file measurement" which cryptographically
identifies the file contents. This algorithm is the same for all
filesystems that support fs-verity.
Merkle tree to produce the digest which cryptographically identifies
the file contents. This algorithm is the same for all filesystems
that support fs-verity.
Userspace only needs to be aware of this algorithm if it needs to
compute the file measurement itself, e.g. in order to sign the file.
compute fs-verity file digests itself, e.g. in order to sign files.
.. _fsverity_merkle_tree:
@ -325,26 +401,22 @@ can't a distinguish a large file from a small second file whose data
is exactly the top-level hash block of the first file. Ambiguities
also arise from the convention of padding to the next block boundary.
To solve this problem, the verity file measurement is actually
computed as a hash of the following structure, which contains the
Merkle tree root hash as well as other fields such as the file size::
To solve this problem, the fs-verity file digest is actually computed
as a hash of the following structure, which contains the Merkle tree
root hash as well as other fields such as the file size::
struct fsverity_descriptor {
__u8 version; /* must be 1 */
__u8 hash_algorithm; /* Merkle tree hash algorithm */
__u8 log_blocksize; /* log2 of size of data and tree blocks */
__u8 salt_size; /* size of salt in bytes; 0 if none */
__le32 sig_size; /* must be 0 */
__le32 __reserved_0x04; /* must be 0 */
__le64 data_size; /* size of file the Merkle tree is built over */
__u8 root_hash[64]; /* Merkle tree root hash */
__u8 salt[32]; /* salt prepended to each hashed block */
__u8 __reserved[144]; /* must be 0's */
};
Note that the ``sig_size`` field must be set to 0 for the purpose of
computing the file measurement, even if a signature was provided (or
will be provided) to `FS_IOC_ENABLE_VERITY`_.
Built-in signature verification
===============================
@ -359,20 +431,20 @@ kernel. Specifically, it adds support for:
certificates from being added.
2. `FS_IOC_ENABLE_VERITY`_ accepts a pointer to a PKCS#7 formatted
detached signature in DER format of the file measurement. On
success, this signature is persisted alongside the Merkle tree.
detached signature in DER format of the file's fs-verity digest.
On success, this signature is persisted alongside the Merkle tree.
Then, any time the file is opened, the kernel will verify the
file's actual measurement against this signature, using the
certificates in the ".fs-verity" keyring.
file's actual digest against this signature, using the certificates
in the ".fs-verity" keyring.
3. A new sysctl "fs.verity.require_signatures" is made available.
When set to 1, the kernel requires that all verity files have a
correctly signed file measurement as described in (2).
correctly signed digest as described in (2).
File measurements must be signed in the following format, which is
similar to the structure used by `FS_IOC_MEASURE_VERITY`_::
fs-verity file digests must be signed in the following format, which
is similar to the structure used by `FS_IOC_MEASURE_VERITY`_::
struct fsverity_signed_digest {
struct fsverity_formatted_digest {
char magic[8]; /* must be "FSVerity" */
__le16 digest_algorithm;
__le16 digest_size;
@ -421,8 +493,8 @@ can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be cleared.
ext4 also supports encryption, which can be used simultaneously with
fs-verity. In this case, the plaintext data is verified rather than
the ciphertext. This is necessary in order to make the file
measurement meaningful, since every file is encrypted differently.
the ciphertext. This is necessary in order to make the fs-verity file
digest meaningful, since every file is encrypted differently.
ext4 stores the verity metadata (Merkle tree and fsverity_descriptor)
past the end of the file, starting at the first 64K boundary beyond
@ -592,8 +664,8 @@ weren't already directly answered in other parts of this document.
:Q: Isn't fs-verity useless because the attacker can just modify the
hashes in the Merkle tree, which is stored on-disk?
:A: To verify the authenticity of an fs-verity file you must verify
the authenticity of the "file measurement", which is basically the
root hash of the Merkle tree. See `Use cases`_.
the authenticity of the "fs-verity file digest", which
incorporates the root hash of the Merkle tree. See `Use cases`_.
:Q: Isn't fs-verity useless because the attacker can just replace a
verity file with a non-verity one?

View File

@ -5277,6 +5277,7 @@ F: Documentation/ABI/testing/sysfs-fs-f2fs
F: fs/f2fs/
F: include/linux/f2fs_fs.h
F: include/trace/events/f2fs.h
F: include/uapi/linux/f2fs.h
F71805F HARDWARE MONITORING DRIVER
M: Jean Delvare <jdelvare@suse.com>

View File

@ -517,6 +517,7 @@ static int dax_open(struct inode *inode, struct file *filp)
inode->i_mapping->host = __dax_inode;
filp->f_mapping = inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
filp->f_sb_err = file_sample_sb_err(filp);
filp->private_data = dev_dax;
inode->i_flags = S_DAX;

View File

@ -1142,12 +1142,19 @@ EXPORT_SYMBOL(mark_buffer_dirty);
void mark_buffer_write_io_error(struct buffer_head *bh)
{
struct super_block *sb;
set_buffer_write_io_error(bh);
/* FIXME: do we need to set this in both places? */
if (bh->b_page && bh->b_page->mapping)
mapping_set_error(bh->b_page->mapping, -EIO);
if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
rcu_read_lock();
sb = READ_ONCE(bh->b_bdev->bd_super);
if (sb)
errseq_set(&sb->s_wb_err, -EIO);
rcu_read_unlock();
}
EXPORT_SYMBOL(mark_buffer_write_io_error);

View File

@ -415,9 +415,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
* directory's encryption key, then @iname is the plaintext, so we encrypt it to
* get the disk_name.
*
* Else, for keyless @lookup operations, @iname is the presented ciphertext, so
* we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be
* impossible in this case, so we fail them with ENOKEY.
* Else, for keyless @lookup operations, @iname should be a no-key name, so we
* decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will
* be impossible in this case, so we fail them with ENOKEY.
*
* If successful, fscrypt_free_filename() must be called later to clean up.
*
@ -461,7 +461,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
}
if (!lookup)
return -ENOKEY;
fname->is_ciphertext_name = true;
fname->is_nokey_name = true;
/*
* We don't have the key and we are doing a lookup; decode the
@ -571,17 +571,17 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
/*
* Plaintext names are always valid, since fscrypt doesn't support
* reverting to ciphertext names without evicting the directory's inode
* reverting to no-key names without evicting the directory's inode
* -- which implies eviction of the dentries in the directory.
*/
if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
return 1;
/*
* Ciphertext name; valid if the directory's key is still unavailable.
* No-key name; valid if the directory's key is still unavailable.
*
* Although fscrypt forbids rename() on ciphertext names, we still must
* use dget_parent() here rather than use ->d_parent directly. That's
* Although fscrypt forbids rename() on no-key names, we still must use
* dget_parent() here rather than use ->d_parent directly. That's
* because a corrupted fs image may contain directory hard links, which
* the VFS handles by moving the directory's dentry tree in the dcache
* each time ->lookup() finds the directory and it already has a dentry
@ -602,4 +602,4 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return valid;
}
EXPORT_SYMBOL(fscrypt_d_revalidate);
EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);

View File

@ -59,8 +59,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
if (err)
return err;
/* ... in case we looked up ciphertext name before key was added */
if (dentry->d_flags & DCACHE_ENCRYPTED_NAME)
/* ... in case we looked up no-key name before key was added */
if (dentry->d_flags & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (!fscrypt_has_permitted_context(dir, inode))
@ -84,9 +84,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
return err;
/* ... in case we looked up ciphertext name(s) before key was added */
if ((old_dentry->d_flags | new_dentry->d_flags) &
DCACHE_ENCRYPTED_NAME)
/* ... in case we looked up no-key name(s) before key was added */
if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (old_dir != new_dir) {
@ -113,9 +112,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
if (err && err != -ENOENT)
return err;
if (fname->is_ciphertext_name) {
if (fname->is_nokey_name) {
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_ENCRYPTED_NAME;
dentry->d_flags |= DCACHE_NOKEY_NAME;
spin_unlock(&dentry->d_lock);
}
return err;

View File

@ -3157,10 +3157,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
/* dir.c */
extern const struct file_operations ext4_dir_operations;
#ifdef CONFIG_UNICODE
extern const struct dentry_operations ext4_dentry_ops;
#endif
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;

View File

@ -1178,6 +1178,12 @@ out:
return -EOPNOTSUPP;
return fsverity_ioctl_measure(filp, (void __user *)arg);
case FS_IOC_READ_VERITY_METADATA:
if (!ext4_has_feature_verity(sb))
return -EOPNOTSUPP;
return fsverity_ioctl_read_metadata(filp,
(const void __user *)arg);
default:
return -ENOTTY;
}
@ -1256,6 +1262,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_MEASURE_VERITY:
case EXT4_IOC_FSGETXATTR:
case EXT4_IOC_FSSETXATTR:
case FS_IOC_READ_VERITY_METADATA:
break;
default:
return -ENOIOCTLCMD;

View File

@ -1689,7 +1689,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct buffer_head *bh;
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
generic_set_encrypted_ci_d_ops(dir, dentry);
generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return NULL;
if (err)

View File

@ -6,6 +6,13 @@ config F2FS_FS
select CRYPTO_CRC32
select F2FS_FS_XATTR if FS_ENCRYPTION
select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
select LZ4_COMPRESS if F2FS_FS_LZ4
select LZ4_DECOMPRESS if F2FS_FS_LZ4
select LZ4HC_COMPRESS if F2FS_FS_LZ4HC
select LZO_COMPRESS if F2FS_FS_LZO
select LZO_DECOMPRESS if F2FS_FS_LZO
select ZSTD_COMPRESS if F2FS_FS_ZSTD
select ZSTD_DECOMPRESS if F2FS_FS_ZSTD
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@ -88,16 +95,6 @@ config F2FS_FS_ENCRYPTION
FS_ENCRYPTION. Use CONFIG_FS_ENCRYPTION=y in new config
files.
config F2FS_IO_TRACE
bool "F2FS IO tracer"
depends on F2FS_FS
depends on FUNCTION_TRACER
help
F2FS IO trace is based on a function trace, which gathers process
information and block IO patterns in the filesystem level.
If unsure, say N.
config F2FS_FAULT_INJECTION
bool "F2FS fault injection facility"
depends on F2FS_FS
@ -116,26 +113,44 @@ config F2FS_FS_COMPRESSION
config F2FS_FS_LZO
bool "LZO compression support"
depends on F2FS_FS_COMPRESSION
select LZO_COMPRESS
select LZO_DECOMPRESS
default y
help
Support LZO compress algorithm, if unsure, say Y.
config F2FS_FS_LZORLE
bool "LZO-RLE compression support"
depends on F2FS_FS_LZO
default y
help
Support LZO-RLE compress algorithm, if unsure, say Y.
config F2FS_FS_LZ4
bool "LZ4 compression support"
depends on F2FS_FS_COMPRESSION
select LZ4_COMPRESS
select LZ4_DECOMPRESS
default y
help
Support LZ4 compress algorithm, if unsure, say Y.
config F2FS_FS_LZ4HC
bool "LZ4HC compression support"
depends on F2FS_FS_LZ4
default y
help
Support LZ4HC compress algorithm, LZ4HC has compatible on-disk
layout with LZ4, if unsure, say Y.
config F2FS_FS_ZSTD
bool "ZSTD compression support"
depends on F2FS_FS_COMPRESSION
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
default y
help
Support ZSTD compress algorithm, if unsure, say Y.
config F2FS_IOSTAT
bool "F2FS IO statistics information"
depends on F2FS_FS
default y
help
Support getting IO statistics through sysfs and printing out periodic
IO statistics tracepoint events. You have to turn on "iostat_enable"
sysfs node to enable this feature.

View File

@ -7,6 +7,6 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
f2fs-$(CONFIG_FS_VERITY) += verity.o
f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o
f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o

View File

@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count)
static inline int f2fs_acl_count(size_t size)
{
ssize_t s;
size -= sizeof(struct f2fs_acl_header);
s = size - 4 * sizeof(struct f2fs_acl_entry_short);
if (s < 0) {
@ -160,7 +161,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
return (void *)f2fs_acl;
fail:
kvfree(f2fs_acl);
kfree(f2fs_acl);
return ERR_PTR(-EINVAL);
}
@ -190,7 +191,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
acl = NULL;
else
acl = ERR_PTR(retval);
kvfree(value);
kfree(value);
return acl;
}
@ -200,6 +201,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
return __f2fs_get_acl(inode, type, NULL);
}
static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
struct posix_acl **acl)
{
umode_t mode = inode->i_mode;
int error;
if (is_inode_flag_set(inode, FI_ACL_MODE))
mode = F2FS_I(inode)->i_acl_mode;
error = posix_acl_equiv_mode(*acl, &mode);
if (error < 0)
return error;
if (error == 0)
*acl = NULL;
if (!in_group_p(inode->i_gid) &&
!capable_wrt_inode_uidgid(inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
}
static int __f2fs_set_acl(struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
@ -213,7 +235,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
error = posix_acl_update_mode(inode, &mode, &acl);
error = f2fs_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
@ -240,7 +262,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
kvfree(value);
kfree(value);
if (!error)
set_cached_acl(inode, type, acl);
@ -384,7 +406,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
struct page *dpage)
{
struct posix_acl *default_acl = NULL, *acl = NULL;
int error = 0;
int error;
error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage);
if (error)

View File

@ -13,13 +13,16 @@
#include <linux/f2fs_fs.h>
#include <linux/pagevec.h>
#include <linux/swap.h>
#include <linux/kthread.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "trace.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;
@ -37,7 +40,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct address_space *mapping = META_MAPPING(sbi);
struct page *page = NULL;
struct page *page;
repeat:
page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
@ -107,7 +110,7 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
return __get_meta_page(sbi, index, true);
}
struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index)
struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct page *page;
int count = 0;
@ -279,18 +282,22 @@ out:
return blkno - start;
}
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
unsigned int ra_blocks)
{
struct page *page;
bool readahead = false;
if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
return;
page = find_get_page(META_MAPPING(sbi), index);
if (!page || !PageUptodate(page))
readahead = true;
f2fs_put_page(page, 0);
if (readahead)
f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}
static int __f2fs_write_meta_page(struct page *page,
@ -348,13 +355,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
if (!mutex_trylock(&sbi->cp_mutex))
if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
mutex_unlock(&sbi->cp_mutex);
f2fs_up_write(&sbi->cp_global_sem);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@ -442,8 +449,7 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
set_page_private_reference(page);
return 1;
}
return 0;
@ -464,16 +470,29 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
unsigned int devidx, int type)
{
struct inode_management *im = &sbi->im[type];
struct ino_entry *e, *tmp;
struct ino_entry *e = NULL, *new = NULL;
tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
if (type == FLUSH_INO) {
rcu_read_lock();
e = radix_tree_lookup(&im->ino_root, ino);
rcu_read_unlock();
}
retry:
if (!e)
new = f2fs_kmem_cache_alloc(ino_entry_slab,
GFP_NOFS, true, NULL);
radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
spin_lock(&im->ino_lock);
e = radix_tree_lookup(&im->ino_root, ino);
if (!e) {
e = tmp;
if (!new) {
spin_unlock(&im->ino_lock);
goto retry;
}
e = new;
if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
f2fs_bug_on(sbi, 1);
@ -491,8 +510,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
spin_unlock(&im->ino_lock);
radix_tree_preload_end();
if (e != tmp)
kmem_cache_free(ino_entry_slab, tmp);
if (new && e != new)
kmem_cache_free(ino_entry_slab, new);
}
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@ -525,7 +544,7 @@ void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
__remove_ino_entry(sbi, ino, type);
}
/* mode should be APPEND_INO or UPDATE_INO */
/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */
bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
struct inode_management *im = &sbi->im[mode];
@ -638,7 +657,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
return PTR_ERR(inode);
}
err = dquot_initialize(inode);
err = f2fs_dquot_initialize(inode);
if (err) {
iput(inode);
goto err_out;
@ -649,7 +668,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
/* truncate all the data during iput */
iput(inode);
err = f2fs_get_node_info(sbi, ino, &ni);
err = f2fs_get_node_info(sbi, ino, &ni, false);
if (err)
goto err_out;
@ -690,9 +709,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_QUOTA
/* Needed for iput() to work correctly and not trash data */
sbi->sb->s_flags |= MS_ACTIVE;
/*
* Turn on quotas which were not enabled for read-only mounts if
* filesystem has quota feature, so that they are updated correctly.
@ -718,6 +734,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
orphan_blk = (struct f2fs_orphan_block *)page_address(page);
for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
err = recover_orphan_inode(sbi, ino);
if (err) {
f2fs_put_page(page, 1);
@ -851,6 +868,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@ -858,15 +876,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (err)
return NULL;
if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
sbi->blocks_per_seg) {
cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
}
pre_version = *version;
cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
&cp_page_2, version);
if (err)
@ -1016,8 +1035,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
set_page_private_reference(page);
}
void f2fs_remove_dirty_inode(struct inode *inode)
@ -1147,7 +1165,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
if (!is_journalled_quota(sbi))
return false;
down_write(&sbi->quota_sem);
if (!f2fs_down_write_trylock(&sbi->quota_sem))
return true;
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
ret = false;
} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
@ -1158,7 +1177,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
ret = true;
}
up_write(&sbi->quota_sem);
f2fs_up_write(&sbi->quota_sem);
return ret;
}
@ -1215,10 +1234,10 @@ retry_flush_dents:
* POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush. inode->i_blocks can be updated.
*/
down_write(&sbi->node_change);
f2fs_down_write(&sbi->node_change);
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
up_write(&sbi->node_change);
f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
@ -1228,15 +1247,15 @@ retry_flush_dents:
}
retry_flush_nodes:
down_write(&sbi->node_write);
f2fs_down_write(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
f2fs_up_write(&sbi->node_write);
atomic_inc(&sbi->wb_sync_req[NODE]);
err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
atomic_dec(&sbi->wb_sync_req[NODE]);
if (err) {
up_write(&sbi->node_change);
f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
return err;
}
@ -1249,13 +1268,13 @@ retry_flush_nodes:
* dirty node blocks and some checkpoint values by block allocation.
*/
__prepare_cp_block(sbi);
up_write(&sbi->node_change);
f2fs_up_write(&sbi->node_change);
return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
{
up_write(&sbi->node_write);
f2fs_up_write(&sbi->node_write);
f2fs_unlock_all(sbi);
}
@ -1264,8 +1283,6 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
if (!get_pages(sbi, type))
break;
@ -1275,6 +1292,10 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
if (type == F2FS_DIRTY_META)
f2fs_sync_meta_pages(sbi, META, LONG_MAX,
FS_CP_META_IO);
else if (type == F2FS_WB_CP_DATA)
f2fs_submit_merged_write(sbi, DATA);
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
finish_wait(&sbi->cp_wait, &wait);
@ -1286,12 +1307,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long flags;
spin_lock_irqsave(&sbi->cp_lock, flags);
if (cpc->reason & CP_UMOUNT) {
if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) {
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Disable nat_bits due to no space");
} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
f2fs_nat_bitmap_enabled(sbi)) {
f2fs_enable_nat_bits(sbi);
set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Rebuild and enable nat_bits");
}
}
if ((cpc->reason & CP_UMOUNT) &&
le32_to_cpu(ckpt->cp_pack_total_block_count) >
sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
disable_nat_bits(sbi, false);
spin_lock_irqsave(&sbi->cp_lock, flags);
if (cpc->reason & CP_TRIMMED)
__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
@ -1383,6 +1412,26 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi,
f2fs_submit_merged_write(sbi, META_FLUSH);
}
static inline u64 get_sectors_written(struct block_device *bdev)
{
return (u64)part_stat_read(bdev->bd_part, sectors[1]);
}
u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
{
if (f2fs_is_multi_device(sbi)) {
u64 sectors = 0;
int i;
for (i = 0; i < sbi->s_ndevs; i++)
sectors += get_sectors_written(FDEV(i).bdev);
return sectors;
}
return get_sectors_written(sbi->sb->s_bdev);
}
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@ -1393,7 +1442,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
__u32 crc32 = 0;
int i;
int cp_payload_blks = __cp_payload(sbi);
struct super_block *sb = sbi->sb;
struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
u64 kbytes_written;
int err;
@ -1421,7 +1469,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
}
/* 2 cp + n data seg summary + orphan inode blocks */
/* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false);
spin_lock_irqsave(&sbi->cp_lock, flags);
if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
@ -1435,7 +1483,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
orphan_blocks);
if (__remain_node_summaries(cpc->reason))
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
cp_payload_blks + data_sum_blocks +
orphan_blocks + NR_CURSEG_NODE_TYPE);
else
@ -1458,7 +1506,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_next_addr(sbi);
/* write nat bits */
if (enabled_nat_bits(sbi, cpc)) {
if ((cpc->reason & CP_UMOUNT) &&
is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
__u64 cp_ver = cur_cp_version(ckpt);
block_t blk;
@ -1488,9 +1537,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Record write statistics in the hot node summary */
kbytes_written = sbi->kbytes_written;
if (sb->s_bdev->bd_part)
kbytes_written += BD_PART_WRITTEN(sbi);
kbytes_written += (f2fs_get_sectors_written(sbi) -
sbi->sectors_written_start) >> 1;
seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
if (__remain_node_summaries(cpc->reason)) {
@ -1501,6 +1549,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
percpu_counter_set(&sbi->rf_node_block_count, 0);
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
@ -1521,9 +1570,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/*
* invalidate intermediate page cache borrowed from meta inode which are
* used for migration of encrypted or verity inode's blocks.
* used for migration of encrypted, verity or compressed inode's blocks.
*/
if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi))
if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
f2fs_sb_has_compression(sbi))
invalidate_mapping_pages(META_MAPPING(sbi),
MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
@ -1569,7 +1619,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_warn(sbi, "Start checkpoint disabled!");
}
if (cpc->reason != CP_RESIZE)
mutex_lock(&sbi->cp_mutex);
f2fs_down_write(&sbi->cp_global_sem);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@ -1597,7 +1647,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
goto out;
}
if (NM_I(sbi)->dirty_nat_cnt == 0 &&
if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
SIT_I(sbi)->dirty_sentries == 0 &&
prefree_segments(sbi) == 0) {
f2fs_flush_sit_entries(sbi, cpc);
@ -1617,16 +1667,27 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* write cached NAT/SIT entries to NAT/SIT area */
err = f2fs_flush_nat_entries(sbi, cpc);
if (err)
if (err) {
f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err);
f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
goto stop;
}
f2fs_flush_sit_entries(sbi, cpc);
/* save inmem log status */
f2fs_save_inmem_curseg(sbi);
err = do_checkpoint(sbi, cpc);
if (err)
if (err) {
f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err);
f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
f2fs_release_discard_addrs(sbi);
else
} else {
f2fs_clear_prefree_segments(sbi, cpc);
}
f2fs_restore_inmem_curseg(sbi);
stop:
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
@ -1639,7 +1700,7 @@ stop:
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
if (cpc->reason != CP_RESIZE)
mutex_unlock(&sbi->cp_mutex);
f2fs_up_write(&sbi->cp_global_sem);
return err;
}
@ -1657,7 +1718,7 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
}
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
NR_CURSEG_TYPE - __cp_payload(sbi)) *
NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
F2FS_ORPHANS_PER_BLOCK;
}
@ -1681,3 +1742,178 @@ void f2fs_destroy_checkpoint_caches(void)
kmem_cache_destroy(ino_entry_slab);
kmem_cache_destroy(f2fs_inode_entry_slab);
}
static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
{
struct cp_control cpc = { .reason = CP_SYNC, };
int err;
f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
f2fs_up_write(&sbi->gc_lock);
return err;
}
static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
{
struct ckpt_req_control *cprc = &sbi->cprc_info;
struct ckpt_req *req, *next;
struct llist_node *dispatch_list;
u64 sum_diff = 0, diff, count = 0;
int ret;
dispatch_list = llist_del_all(&cprc->issue_list);
if (!dispatch_list)
return;
dispatch_list = llist_reverse_order(dispatch_list);
ret = __write_checkpoint_sync(sbi);
atomic_inc(&cprc->issued_ckpt);
llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
req->ret = ret;
complete(&req->wait);
sum_diff += diff;
count++;
}
atomic_sub(count, &cprc->queued_ckpt);
atomic_add(count, &cprc->total_ckpt);
spin_lock(&cprc->stat_lock);
cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
if (cprc->peak_time < cprc->cur_time)
cprc->peak_time = cprc->cur_time;
spin_unlock(&cprc->stat_lock);
}
static int issue_checkpoint_thread(void *data)
{
struct f2fs_sb_info *sbi = data;
struct ckpt_req_control *cprc = &sbi->cprc_info;
wait_queue_head_t *q = &cprc->ckpt_wait_queue;
repeat:
if (kthread_should_stop())
return 0;
if (!llist_empty(&cprc->issue_list))
__checkpoint_and_complete_reqs(sbi);
wait_event_interruptible(*q,
kthread_should_stop() || !llist_empty(&cprc->issue_list));
goto repeat;
}
static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
struct ckpt_req *wait_req)
{
struct ckpt_req_control *cprc = &sbi->cprc_info;
if (!llist_empty(&cprc->issue_list)) {
__checkpoint_and_complete_reqs(sbi);
} else {
/* already dispatched by issue_checkpoint_thread */
if (wait_req)
wait_for_completion(&wait_req->wait);
}
}
static void init_ckpt_req(struct ckpt_req *req)
{
memset(req, 0, sizeof(struct ckpt_req));
init_completion(&req->wait);
req->queue_time = ktime_get();
}
int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
{
struct ckpt_req_control *cprc = &sbi->cprc_info;
struct ckpt_req req;
struct cp_control cpc;
cpc.reason = __get_cp_reason(sbi);
if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
int ret;
f2fs_down_write(&sbi->gc_lock);
ret = f2fs_write_checkpoint(sbi, &cpc);
f2fs_up_write(&sbi->gc_lock);
return ret;
}
if (!cprc->f2fs_issue_ckpt)
return __write_checkpoint_sync(sbi);
init_ckpt_req(&req);
llist_add(&req.llnode, &cprc->issue_list);
atomic_inc(&cprc->queued_ckpt);
/*
* update issue_list before we wake up issue_checkpoint thread,
* this smp_mb() pairs with another barrier in ___wait_event(),
* see more details in comments of waitqueue_active().
*/
smp_mb();
if (waitqueue_active(&cprc->ckpt_wait_queue))
wake_up(&cprc->ckpt_wait_queue);
if (cprc->f2fs_issue_ckpt)
wait_for_completion(&req.wait);
else
flush_remained_ckpt_reqs(sbi, &req);
return req.ret;
}
int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct ckpt_req_control *cprc = &sbi->cprc_info;
if (cprc->f2fs_issue_ckpt)
return 0;
cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(cprc->f2fs_issue_ckpt)) {
cprc->f2fs_issue_ckpt = NULL;
return -ENOMEM;
}
set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
return 0;
}
void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
{
struct ckpt_req_control *cprc = &sbi->cprc_info;
if (cprc->f2fs_issue_ckpt) {
struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt;
cprc->f2fs_issue_ckpt = NULL;
kthread_stop(ckpt_task);
flush_remained_ckpt_reqs(sbi, NULL);
}
}
void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
{
struct ckpt_req_control *cprc = &sbi->cprc_info;
atomic_set(&cprc->issued_ckpt, 0);
atomic_set(&cprc->total_ckpt, 0);
atomic_set(&cprc->queued_ckpt, 0);
cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO;
init_waitqueue_head(&cprc->ckpt_wait_queue);
init_llist_head(&cprc->issue_list);
spin_lock_init(&cprc->stat_lock);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -21,7 +21,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
static DEFINE_MUTEX(f2fs_stat_mutex);
static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
#ifdef CONFIG_DEBUG_FS
static struct dentry *f2fs_debugfs_root;
#endif
@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi)
atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
}
si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt);
si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt);
si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt);
spin_lock(&sbi->cprc_info.stat_lock);
si->cur_ckpt_time = sbi->cprc_info.cur_time;
si->peak_ckpt_time = sbi->cprc_info.peak_time;
spin_unlock(&sbi->cprc_info.stat_lock);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@ -131,7 +138,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
si->compr_inode = atomic_read(&sbi->compr_inode);
si->compr_blocks = atomic_read(&sbi->compr_blocks);
si->compr_blocks = atomic64_read(&sbi->compr_blocks);
si->append = sbi->im[APPEND_INO].ino_num;
si->update = sbi->im[UPDATE_INO].ino_num;
si->orphans = sbi->im[ORPHAN_INO].ino_num;
@ -145,8 +152,14 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->node_pages = NODE_MAPPING(sbi)->nrpages;
if (sbi->meta_inode)
si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (sbi->compress_inode) {
si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages;
si->compress_page_hit = atomic_read(&sbi->compress_page_hit);
}
#endif
si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT];
si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT];
si->sits = MAIN_SEGS(sbi);
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
@ -164,8 +177,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
si->util_invalid = 50 - si->util_free - si->util_valid;
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
si->curseg[i] = curseg->segno;
si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
@ -174,6 +188,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
for (i = META_CP; i < META_MAX; i++)
si->meta_count[i] = atomic_read(&sbi->meta_count[i]);
for (i = 0; i < NO_CHECK_TYPE; i++) {
si->dirty_seg[i] = 0;
si->full_seg[i] = 0;
si->valid_blks[i] = 0;
}
for (i = 0; i < MAIN_SEGS(sbi); i++) {
int blks = get_seg_entry(sbi, i)->valid_blocks;
int type = get_seg_entry(sbi, i)->type;
if (!blks)
continue;
if (blks == sbi->blocks_per_seg)
si->full_seg[type]++;
else
si->dirty_seg[type]++;
si->valid_blks[type] += blks;
}
for (i = 0; i < 2; i++) {
si->segment_count[i] = sbi->segment_count[i];
si->block_count[i] = sbi->block_count[i];
@ -258,9 +292,10 @@ get_cache:
si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] +
NM_I(sbi)->nid_cnt[PREALLOC_NID]) *
sizeof(struct free_nid);
si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] *
sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
@ -272,21 +307,47 @@ get_cache:
si->page_mem = 0;
if (sbi->node_inode) {
unsigned npages = NODE_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
if (sbi->meta_inode) {
unsigned npages = META_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (sbi->compress_inode) {
unsigned npages = COMPRESS_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#endif
}
static char *s_flag[] = {
[SBI_IS_DIRTY] = " fs_dirty",
[SBI_IS_CLOSE] = " closing",
[SBI_NEED_FSCK] = " need_fsck",
[SBI_POR_DOING] = " recovering",
[SBI_NEED_SB_WRITE] = " sb_dirty",
[SBI_NEED_CP] = " need_cp",
[SBI_IS_SHUTDOWN] = " shutdown",
[SBI_IS_RECOVERED] = " recovered",
[SBI_CP_DISABLED] = " cp_disabled",
[SBI_CP_DISABLED_QUICK] = " cp_disabled_quick",
[SBI_QUOTA_NEED_FLUSH] = " quota_need_flush",
[SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
[SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
[SBI_IS_RESIZEFS] = " resizefs",
[SBI_IS_FREEZING] = " freezefs",
};
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0;
int j;
int i = 0, j = 0;
unsigned long flags;
mutex_lock(&f2fs_stat_mutex);
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
@ -294,7 +355,13 @@ static int stat_show(struct seq_file *s, void *v)
si->sbi->sb->s_bdev, i++,
f2fs_readonly(si->sbi->sb) ? "RO": "RW",
is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
"Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"));
"Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
if (si->sbi->s_flag) {
seq_puts(s, "[SBI:");
for_each_set_bit(j, &si->sbi->s_flag, 32)
seq_puts(s, s_flag[j]);
seq_puts(s, "]\n");
}
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@ -322,37 +389,65 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_inode);
seq_printf(s, " - Inline_dentry Inode: %u\n",
si->inline_dir);
seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n",
seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n",
si->compr_inode, si->compr_blocks);
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
seq_printf(s, " - COLD data: %d, %d, %d\n",
seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n",
"segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk");
seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_COLD_DATA],
si->cursec[CURSEG_COLD_DATA],
si->curzone[CURSEG_COLD_DATA]);
seq_printf(s, " - WARM data: %d, %d, %d\n",
si->curzone[CURSEG_COLD_DATA],
si->dirty_seg[CURSEG_COLD_DATA],
si->full_seg[CURSEG_COLD_DATA],
si->valid_blks[CURSEG_COLD_DATA]);
seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_WARM_DATA],
si->cursec[CURSEG_WARM_DATA],
si->curzone[CURSEG_WARM_DATA]);
seq_printf(s, " - HOT data: %d, %d, %d\n",
si->curzone[CURSEG_WARM_DATA],
si->dirty_seg[CURSEG_WARM_DATA],
si->full_seg[CURSEG_WARM_DATA],
si->valid_blks[CURSEG_WARM_DATA]);
seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_HOT_DATA],
si->cursec[CURSEG_HOT_DATA],
si->curzone[CURSEG_HOT_DATA]);
seq_printf(s, " - Dir dnode: %d, %d, %d\n",
si->curzone[CURSEG_HOT_DATA],
si->dirty_seg[CURSEG_HOT_DATA],
si->full_seg[CURSEG_HOT_DATA],
si->valid_blks[CURSEG_HOT_DATA]);
seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_HOT_NODE],
si->cursec[CURSEG_HOT_NODE],
si->curzone[CURSEG_HOT_NODE]);
seq_printf(s, " - File dnode: %d, %d, %d\n",
si->curzone[CURSEG_HOT_NODE],
si->dirty_seg[CURSEG_HOT_NODE],
si->full_seg[CURSEG_HOT_NODE],
si->valid_blks[CURSEG_HOT_NODE]);
seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_WARM_NODE],
si->cursec[CURSEG_WARM_NODE],
si->curzone[CURSEG_WARM_NODE]);
seq_printf(s, " - Indir nodes: %d, %d, %d\n",
si->curzone[CURSEG_WARM_NODE],
si->dirty_seg[CURSEG_WARM_NODE],
si->full_seg[CURSEG_WARM_NODE],
si->valid_blks[CURSEG_WARM_NODE]);
seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n",
si->curseg[CURSEG_COLD_NODE],
si->cursec[CURSEG_COLD_NODE],
si->curzone[CURSEG_COLD_NODE]);
si->curzone[CURSEG_COLD_NODE],
si->dirty_seg[CURSEG_COLD_NODE],
si->full_seg[CURSEG_COLD_NODE],
si->valid_blks[CURSEG_COLD_NODE]);
seq_printf(s, " - Pinned file: %8d %8d %8d\n",
si->curseg[CURSEG_COLD_DATA_PINNED],
si->cursec[CURSEG_COLD_DATA_PINNED],
si->curzone[CURSEG_COLD_DATA_PINNED]);
seq_printf(s, " - ATGC data: %8d %8d %8d\n",
si->curseg[CURSEG_ALL_DATA_ATGC],
si->cursec[CURSEG_ALL_DATA_ATGC],
si->curzone[CURSEG_ALL_DATA_ATGC]);
seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
si->main_area_segs - si->dirty_count -
si->prefree_count - si->free_segs,
@ -368,12 +463,28 @@ static int stat_show(struct seq_file *s, void *v)
si->meta_count[META_NAT]);
seq_printf(s, " - ssa blocks : %u\n",
si->meta_count[META_SSA]);
seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
"Cur time: %4d(ms), Peak time: %4d(ms))\n",
si->nr_queued_ckpt, si->nr_issued_ckpt,
si->nr_total_ckpt, si->cur_ckpt_time,
si->peak_ckpt_time);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
si->data_segs, si->bg_data_segs);
seq_printf(s, " - node segments : %d (%d)\n",
si->node_segs, si->bg_node_segs);
seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
"Idle Greedy (%d), Idle AT (%d), "
"Urgent High (%d), Urgent Mid (%d), "
"Urgent Low (%d)\n",
si->sbi->gc_reclaimed_segs[GC_NORMAL],
si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
@ -412,6 +523,7 @@ static int stat_show(struct seq_file *s, void *v)
"volatile IO: %4d (Max. %4d)\n",
si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
si->vw_cnt, si->max_vw_cnt);
seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
@ -424,6 +536,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
seq_printf(s, " - fsync mark: %4lld\n",
percpu_counter_sum_positive(
&si->sbi->rf_node_block_count));
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
@ -465,7 +580,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
mutex_unlock(&f2fs_stat_mutex);
raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@ -487,6 +602,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
unsigned long flags;
int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
@ -513,7 +629,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
atomic_set(&sbi->compr_inode, 0);
atomic_set(&sbi->compr_blocks, 0);
atomic64_set(&sbi->compr_blocks, 0);
atomic_set(&sbi->inplace_count, 0);
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
@ -522,9 +638,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->max_aw_cnt, 0);
atomic_set(&sbi->max_vw_cnt, 0);
mutex_lock(&f2fs_stat_mutex);
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_add_tail(&si->stat_list, &f2fs_stat_list);
mutex_unlock(&f2fs_stat_mutex);
raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@ -532,12 +648,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned long flags;
mutex_lock(&f2fs_stat_mutex);
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_del(&si->stat_list);
mutex_unlock(&f2fs_stat_mutex);
raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
kvfree(si);
kfree(si);
}
void __init f2fs_create_root_stats(void)
@ -545,7 +662,7 @@ void __init f2fs_create_root_stats(void)
#ifdef CONFIG_DEBUG_FS
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL,
&stat_fops);
#endif
}

View File

@ -16,6 +16,10 @@
#include "xattr.h"
#include <trace/events/f2fs.h>
#ifdef CONFIG_UNICODE
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
static unsigned long dir_blocks(struct inode *inode)
{
return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
@ -76,21 +80,21 @@ int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname)
{
#ifdef CONFIG_UNICODE
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct super_block *sb = dir->i_sb;
if (IS_CASEFOLDED(dir)) {
fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN,
GFP_NOFS);
fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
GFP_NOFS, false, F2FS_SB(sb));
if (!fname->cf_name.name)
return -ENOMEM;
fname->cf_name.len = utf8_casefold(sbi->sb->s_encoding,
fname->cf_name.len = utf8_casefold(sb->s_encoding,
fname->usr_fname,
fname->cf_name.name,
F2FS_NAME_LEN);
F2FS_NAME_LEN);s
if ((int)fname->cf_name.len <= 0) {
kfree(fname->cf_name.name);
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
if (sb_has_enc_strict_mode(dir->i_sb))
if (sb_has_strict_encoding(sb))
return -EINVAL;
/* fall back to treating name as opaque byte sequence */
}
@ -112,7 +116,7 @@ static int __f2fs_setup_filename(const struct inode *dir,
#ifdef CONFIG_FS_ENCRYPTION
fname->crypto_buf = crypt_name->crypto_buf;
#endif
if (crypt_name->is_ciphertext_name) {
if (crypt_name->is_nokey_name) {
/* hash was decoded from the no-key name */
fname->hash = cpu_to_le32(crypt_name->hash);
} else {
@ -171,8 +175,10 @@ void f2fs_free_filename(struct f2fs_filename *fname)
fname->crypto_buf.name = NULL;
#endif
#ifdef CONFIG_UNICODE
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
if (fname->cf_name.name) {
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
}
#endif
}
@ -191,29 +197,25 @@ static unsigned long dir_block_index(unsigned int level,
static struct f2fs_dir_entry *find_in_block(struct inode *dir,
struct page *dentry_page,
const struct f2fs_filename *fname,
int *max_slots,
struct page **res_page)
int *max_slots)
{
struct f2fs_dentry_block *dentry_blk;
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
make_dentry_ptr_block(dir, &d, dentry_blk);
de = f2fs_find_target_dentry(&d, fname, max_slots);
if (de)
*res_page = dentry_page;
return de;
return f2fs_find_target_dentry(&d, fname, max_slots);
}
#ifdef CONFIG_UNICODE
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for.
*
* Returns 1 for a match, 0 for no match, and -errno on an error.
*/
static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
const u8 *de_name, u32 de_name_len)
{
const struct super_block *sb = dir->i_sb;
@ -227,11 +229,11 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
FSTR_INIT((u8 *)de_name, de_name_len);
if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir)))
return false;
return -EINVAL;
decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
if (!decrypted_name.name)
return false;
return -ENOMEM;
res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name,
&decrypted_name);
if (res < 0)
@ -241,23 +243,24 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
}
res = utf8_strncasecmp_folded(um, name, &entry);
if (res < 0) {
/*
* In strict mode, ignore invalid names. In non-strict mode,
* fall back to treating them as opaque byte sequences.
*/
if (sb_has_enc_strict_mode(sb) || name->len != entry.len)
res = 1;
else
res = memcmp(name->name, entry.name, name->len);
/*
* In strict mode, ignore invalid names. In non-strict mode,
* fall back to treating them as opaque byte sequences.
*/
if (res < 0 && !sb_has_strict_encoding(sb)) {
res = name->len == entry.len &&
memcmp(name->name, entry.name, name->len) == 0;
} else {
/* utf8_strncasecmp_folded returns 0 on match */
res = (res == 0);
}
out:
kfree(decrypted_name.name);
return res == 0;
return res;
}
#endif /* CONFIG_UNICODE */
static inline bool f2fs_match_name(const struct inode *dir,
static inline int f2fs_match_name(const struct inode *dir,
const struct f2fs_filename *fname,
const u8 *de_name, u32 de_name_len)
{
@ -284,6 +287,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
int max_len = 0;
int res = 0;
if (max_slots)
*max_slots = 0;
@ -301,10 +305,15 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
continue;
}
if (de->hash_code == fname->hash &&
f2fs_match_name(d->inode, fname, d->filename[bit_pos],
le16_to_cpu(de->name_len)))
goto found;
if (de->hash_code == fname->hash) {
res = f2fs_match_name(d->inode, fname,
d->filename[bit_pos],
le16_to_cpu(de->name_len));
if (res < 0)
return ERR_PTR(res);
if (res)
goto found;
}
if (max_slots && max_len > *max_slots)
*max_slots = max_len;
@ -353,10 +362,15 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
}
}
de = find_in_block(dir, dentry_page, fname, &max_slots,
res_page);
if (de)
de = find_in_block(dir, dentry_page, fname, &max_slots);
if (IS_ERR(de)) {
*res_page = ERR_CAST(de);
de = NULL;
break;
} else if (de) {
*res_page = dentry_page;
break;
}
if (max_slots >= s)
room = true;
@ -380,16 +394,15 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
unsigned int max_depth;
unsigned int level;
*res_page = NULL;
if (f2fs_has_inline_dentry(dir)) {
*res_page = NULL;
de = f2fs_find_in_inline_dir(dir, fname, res_page);
goto out;
}
if (npages == 0) {
*res_page = NULL;
if (npages == 0)
goto out;
}
max_depth = F2FS_I(dir)->i_current_depth;
if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
@ -400,7 +413,6 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
}
for (level = 0; level < max_depth; level++) {
*res_page = NULL;
de = find_in_level(dir, level, fname, res_page);
if (de || IS_ERR(*res_page))
break;
@ -466,6 +478,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode)
{
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
lock_page(page);
f2fs_wait_on_page_writeback(page, type, true, true);
de->ino = cpu_to_le32(inode->i_ino);
@ -755,7 +768,7 @@ add_dentry:
f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@ -782,7 +795,7 @@ add_dentry:
f2fs_update_parent_metadata(dir, inode, current_depth);
fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
f2fs_up_write(&F2FS_I(inode)->i_sem);
f2fs_put_page(dentry_page, 1);
@ -820,7 +833,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
return err;
/*
* An immature stakable filesystem shows a race condition between lookup
* An immature stackable filesystem shows a race condition between lookup
* and create. If we have same task when doing lookup and create, it's
* definitely fine as expected by VFS normally. Otherwise, let's just
* verify on-disk dentry one more time, which guarantees filesystem
@ -847,7 +860,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
struct page *page;
int err = 0;
down_write(&F2FS_I(inode)->i_sem);
f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@ -858,7 +871,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(inode, FI_NEW_INODE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
up_write(&F2FS_I(inode)->i_sem);
f2fs_up_write(&F2FS_I(inode)->i_sem);
return err;
}
@ -866,7 +879,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
down_write(&F2FS_I(inode)->i_sem);
f2fs_down_write(&F2FS_I(inode)->i_sem);
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
@ -877,7 +890,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
f2fs_i_links_write(inode, false);
f2fs_i_size_write(inode, 0);
}
up_write(&F2FS_I(inode)->i_sem);
f2fs_up_write(&F2FS_I(inode)->i_sem);
if (inode->i_nlink == 0)
f2fs_add_orphan_inode(inode);
@ -923,11 +936,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_radix_tree_dirty_tag(page);
clear_page_dirty_for_io(page);
f2fs_clear_page_private(page);
ClearPageUptodate(page);
clear_cold_data(page);
clear_page_private_gcing(page);
inode_dec_dirty_pages(dir);
f2fs_remove_dirty_inode(dir);
detach_page_private(page);
set_page_private(page, 0);
}
f2fs_put_page(page, 1);
@ -985,6 +1002,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
struct blk_plug plug;
bool readdir_ra = sbi->readdir_ra == 1;
bool found_valid_dirent = false;
int err = 0;
bit_pos = ((unsigned long)ctx->pos % d->max);
@ -999,13 +1017,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
de = &d->dentry[bit_pos];
if (de->name_len == 0) {
if (found_valid_dirent || !bit_pos) {
printk_ratelimited(
"%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
KERN_WARNING, sbi->sb->s_id,
le32_to_cpu(de->ino));
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
bit_pos++;
ctx->pos = start_pos + bit_pos;
printk_ratelimited(
"%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
KERN_WARNING, sbi->sb->s_id,
le32_to_cpu(de->ino));
set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
@ -1048,6 +1068,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
ctx->pos = start_pos + bit_pos;
found_valid_dirent = true;
}
out:
if (readdir_ra)

View File

@ -58,6 +58,29 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
return re;
}
struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi,
struct rb_root_cached *root,
struct rb_node **parent,
unsigned long long key, bool *leftmost)
{
struct rb_node **p = &root->rb_root.rb_node;
struct rb_entry *re;
while (*p) {
*parent = *p;
re = rb_entry(*parent, struct rb_entry, rb_node);
if (key < re->key) {
p = &(*p)->rb_left;
} else {
p = &(*p)->rb_right;
*leftmost = false;
}
}
return p;
}
struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
struct rb_root_cached *root,
struct rb_node **parent,
@ -166,7 +189,7 @@ lookup_neighbors:
}
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
struct rb_root_cached *root)
struct rb_root_cached *root, bool check_key)
{
#ifdef CONFIG_F2FS_CHECK_FS
struct rb_node *cur = rb_first_cached(root), *next;
@ -183,13 +206,23 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
cur_re = rb_entry(cur, struct rb_entry, rb_node);
next_re = rb_entry(next, struct rb_entry, rb_node);
if (check_key) {
if (cur_re->key > next_re->key) {
f2fs_info(sbi, "inconsistent rbtree, "
"cur(%llu) next(%llu)",
cur_re->key, next_re->key);
return false;
}
goto next;
}
if (cur_re->ofs + cur_re->len > next_re->ofs) {
f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)",
cur_re->ofs, cur_re->len,
next_re->ofs, next_re->len);
return false;
}
next:
cur = next;
}
#endif
@ -206,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
{
struct extent_node *en;
en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi);
if (!en)
return NULL;
@ -259,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
mutex_lock(&sbi->extent_tree_lock);
et = radix_tree_lookup(&sbi->extent_tree_root, ino);
if (!et) {
et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
et = f2fs_kmem_cache_alloc(extent_tree_slab,
GFP_NOFS, true, NULL);
f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
memset(et, 0, sizeof(struct extent_tree));
et->ino = ino;
@ -325,9 +359,10 @@ static void __drop_largest_extent(struct extent_tree *et,
}
/* return true, if inode page is changed */
static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL;
struct extent_tree *et;
struct extent_node *en;
struct extent_info ei;
@ -335,16 +370,18 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e
if (!f2fs_may_extent_tree(inode)) {
/* drop largest extent */
if (i_ext && i_ext->len) {
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
i_ext->len = 0;
return true;
set_page_dirty(ipage);
return;
}
return false;
return;
}
et = __grab_extent_tree(inode);
if (!i_ext || !i_ext->len)
return false;
return;
get_extent_info(&ei, i_ext);
@ -360,17 +397,14 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e
}
out:
write_unlock(&et->lock);
return false;
}
bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
void f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
{
bool ret = __f2fs_init_extent_tree(inode, i_ext);
__f2fs_init_extent_tree(inode, ipage);
if (!F2FS_I(inode)->extent_tree)
set_inode_flag(inode, FI_NO_EXTENT);
return ret;
}
static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
@ -628,6 +662,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
void f2fs_update_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int llen,
unsigned int c_len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree;
struct extent_node *en = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
bool leftmost = false;
trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen);
/* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
if (is_inode_flag_set(inode, FI_NO_EXTENT))
return;
write_lock(&et->lock);
en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
(struct rb_entry *)et->cached_en, fofs,
(struct rb_entry **)&prev_en,
(struct rb_entry **)&next_en,
&insert_p, &insert_parent, false,
&leftmost);
if (en)
goto unlock_out;
set_extent_info(&ei, fofs, blkaddr, llen);
ei.c_len = c_len;
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
__insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent, leftmost);
unlock_out:
write_unlock(&et->lock);
}
#endif
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct extent_tree *et, *next;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,14 @@
#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
/* choose candidates from sections which has age of more than 7 days */
#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7)
#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */
#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */
#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */
#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
@ -34,6 +42,12 @@ struct f2fs_gc_kthread {
/* for changing gc mode */
unsigned int gc_wake;
/* for GC_MERGE mount option */
wait_queue_head_t fggc_wq; /*
* caller of f2fs_balance_fs()
* will wait on this wait queue.
*/
};
struct gc_inode_list {
@ -41,16 +55,69 @@ struct gc_inode_list {
struct radix_tree_root iroot;
};
struct victim_info {
unsigned long long mtime; /* mtime of section */
unsigned int segno; /* section No. */
};
struct victim_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
union {
struct {
unsigned long long mtime; /* mtime of section */
unsigned int segno; /* segment No. */
};
struct victim_info vi; /* victim info */
};
struct list_head list;
};
/*
* inline functions
*/
/*
* On a Zoned device zone-capacity can be less than zone-size and if
* zone-capacity is not aligned to f2fs segment size(2MB), then the segment
* starting just before zone-capacity has some blocks spanning across the
* zone-capacity, these blocks are not usable.
* Such spanning segments can be in free list so calculate the sum of usable
* blocks in currently free segments including normal and spanning segments.
*/
static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi)
{
block_t free_seg_blks = 0;
struct free_segmap_info *free_i = FREE_I(sbi);
int j;
spin_lock(&free_i->segmap_lock);
for (j = 0; j < MAIN_SEGS(sbi); j++)
if (!test_bit(j, free_i->free_segmap))
free_seg_blks += f2fs_usable_blks_in_seg(sbi, j);
spin_unlock(&free_i->segmap_lock);
return free_seg_blks;
}
static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi)
{
if (f2fs_sb_has_blkzoned(sbi))
return free_segs_blk_count_zoned(sbi);
return free_segments(sbi) << sbi->log_blocks_per_seg;
}
static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
{
if (free_segments(sbi) < overprovision_segments(sbi))
block_t free_blks, ovp_blks;
free_blks = free_segs_blk_count(sbi);
ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg;
if (free_blks < ovp_blks)
return 0;
else
return (free_segments(sbi) - overprovision_segments(sbi))
<< sbi->log_blocks_per_seg;
return free_blks - ovp_blks;
}
static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)

View File

@ -11,6 +11,7 @@
#include "f2fs.h"
#include "node.h"
#include <trace/events/f2fs.h>
bool f2fs_may_inline_data(struct inode *inode)
{
@ -129,7 +130,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
if (err)
return err;
err = f2fs_get_node_info(fio.sbi, dn->nid, &ni);
err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false);
if (err) {
f2fs_truncate_data_blocks_range(dn, 1);
f2fs_put_dnode(dn);
@ -171,7 +172,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
/* clear inline data and flag after data writeback */
f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0);
clear_inline_node(dn->inode_page);
clear_page_private_inline(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
clear_inode_flag(dn->inode, FI_INLINE_DATA);
@ -186,9 +187,14 @@ int f2fs_convert_inline_inode(struct inode *inode)
struct page *ipage, *page;
int err = 0;
if (!f2fs_has_inline_data(inode))
if (!f2fs_has_inline_data(inode) ||
f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb))
return 0;
err = f2fs_dquot_initialize(inode);
if (err)
return err;
page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
if (!page)
return -ENOMEM;
@ -248,12 +254,12 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
clear_inline_node(dn.inode_page);
clear_page_private_inline(dn.inode_page);
f2fs_put_dnode(&dn);
return 0;
}
bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
int f2fs_recover_inline_data(struct inode *inode, struct page *npage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
@ -265,7 +271,7 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
* [prev.] [next] of inline_data flag
* o o -> recover inline_data
* o x -> remove inline_data, and then recover data blocks
* x o -> remove inline_data, and then recover inline_data
* x o -> remove data blocks, and then recover inline_data
* x x -> recover data blocks
*/
if (IS_INODE(npage))
@ -275,7 +281,8 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
ipage = f2fs_get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
if (IS_ERR(ipage))
return PTR_ERR(ipage);
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
@ -288,21 +295,27 @@ process_inline:
set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
return true;
return 1;
}
if (f2fs_has_inline_data(inode)) {
ipage = f2fs_get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
if (IS_ERR(ipage))
return PTR_ERR(ipage);
f2fs_truncate_inline_inode(inode, ipage, 0);
stat_dec_inline_inode(inode);
clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
if (f2fs_truncate_blocks(inode, 0, false))
return false;
int ret;
ret = f2fs_truncate_blocks(inode, 0, false);
if (ret)
return ret;
stat_inc_inline_inode(inode);
goto process_inline;
}
return false;
return 0;
}
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
@ -326,6 +339,10 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
make_dentry_ptr_inline(dir, &d, inline_dentry);
de = f2fs_find_target_dentry(&d, fname, NULL);
unlock_page(ipage);
if (IS_ERR(de)) {
*res_page = ERR_CAST(de);
de = NULL;
}
if (de)
*res_page = ipage;
else
@ -518,7 +535,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
!f2fs_has_inline_xattr(dir))
F2FS_I(dir)->i_inline_xattr_size = 0;
kvfree(backup_dentry);
kfree(backup_dentry);
return 0;
recover:
lock_page(ipage);
@ -529,7 +546,7 @@ recover:
set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
kvfree(backup_dentry);
kfree(backup_dentry);
return err;
}
@ -611,7 +628,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
}
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@ -640,7 +657,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
f2fs_update_parent_metadata(dir, inode, 0);
fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
f2fs_up_write(&F2FS_I(inode)->i_sem);
out:
f2fs_put_page(ipage, 1);
return err;
@ -768,7 +785,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
ilen = start + len;
ilen -= start;
err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false);
if (err)
goto out;
@ -776,6 +793,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
byteaddr += (char *)inline_data_addr(inode, ipage) -
(char *)F2FS_INODE(ipage);
err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err);
out:
f2fs_put_page(ipage, 1);
return err;

View File

@ -18,6 +18,10 @@
#include <trace/events/f2fs.h>
#ifdef CONFIG_F2FS_FS_COMPRESSION
extern const struct address_space_operations f2fs_compress_aops;
#endif
void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
{
if (is_inode_flag_set(inode, FI_NEW_INODE))
@ -287,11 +291,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off",
__func__, inode->i_ino);
return false;
}
if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) &&
fi->i_flags & F2FS_COMPR_FL &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
if (ri->i_compress_algorithm >= COMPRESS_MAX) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
"compress algorithm: %u, run fsck to fix",
__func__, inode->i_ino,
@ -300,6 +312,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
if (le64_to_cpu(ri->i_compr_blocks) >
SECTOR_TO_BLOCK(inode->i_blocks)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent "
"i_compr_blocks:%llu, i_blocks:%lu, run fsck to fix",
__func__, inode->i_ino,
@ -309,6 +322,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
"log cluster size: %u, run fsck to fix",
__func__, inode->i_ino,
@ -367,8 +381,7 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
if (f2fs_init_extent_tree(inode, &ri->i_ext))
set_page_dirty(node_page);
f2fs_init_extent_tree(inode, node_page);
get_inline_info(inode, ri);
@ -402,6 +415,7 @@ static int do_read_inode(struct inode *inode)
/* try to recover cold bit for non-dir inode */
if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) {
f2fs_wait_on_page_writeback(node_page, NODE, true, true);
set_cold_node(node_page, false);
set_page_dirty(node_page);
}
@ -442,9 +456,11 @@ static int do_read_inode(struct inode *inode)
(fi->i_flags & F2FS_COMPR_FL)) {
if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks);
atomic_set(&fi->i_compr_blocks,
le64_to_cpu(ri->i_compr_blocks));
fi->i_compress_algorithm = ri->i_compress_algorithm;
fi->i_log_cluster_size = ri->i_log_cluster_size;
fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag);
fi->i_cluster_size = 1 << fi->i_log_cluster_size;
set_inode_flag(inode, FI_COMPRESSED_FILE);
}
@ -460,7 +476,7 @@ static int do_read_inode(struct inode *inode)
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
stat_inc_compr_inode(inode);
stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks));
return 0;
}
@ -482,6 +498,11 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
goto make_now;
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (ino == F2FS_COMPRESS_INO(sbi))
goto make_now;
#endif
ret = do_read_inode(inode);
if (ret)
goto bad_inode;
@ -492,6 +513,17 @@ make_now:
} else if (ino == F2FS_META_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_meta_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
} else if (ino == F2FS_COMPRESS_INO(sbi)) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
inode->i_mapping->a_ops = &f2fs_compress_aops;
/*
* generic_error_remove_page only truncates pages of regular
* inode
*/
inode->i_mode |= S_IFREG;
#endif
mapping_set_gfp_mask(inode->i_mapping,
GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
} else if (S_ISREG(inode->i_mode)) {
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
@ -500,7 +532,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
inode_nohighmem(inode);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
} else if (S_ISLNK(inode->i_mode)) {
if (file_is_encrypt(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
@ -517,6 +549,14 @@ make_now:
goto bad_inode;
}
f2fs_set_inode_flags(inode);
if (file_should_truncate(inode)) {
ret = f2fs_truncate(inode);
if (ret)
goto bad_inode;
file_dont_truncate(inode);
}
unlock_new_inode(inode);
trace_f2fs_iget(inode);
return inode;
@ -619,9 +659,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
i_log_cluster_size)) {
ri->i_compr_blocks =
cpu_to_le64(F2FS_I(inode)->i_compr_blocks);
cpu_to_le64(atomic_read(
&F2FS_I(inode)->i_compr_blocks));
ri->i_compress_algorithm =
F2FS_I(inode)->i_compress_algorithm;
ri->i_compress_flag =
cpu_to_le16(F2FS_I(inode)->i_compress_flag);
ri->i_log_cluster_size =
F2FS_I(inode)->i_log_cluster_size;
}
@ -631,7 +674,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
/* deleted inode */
if (inode->i_nlink == 0)
clear_inline_node(node_page);
clear_page_private_inline(node_page);
F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
@ -651,6 +694,7 @@ retry:
node_page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
int err = PTR_ERR(node_page);
if (err == -ENOMEM) {
cond_resched();
goto retry;
@ -683,7 +727,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
/*
* We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
* during the urgent cleaning time when running out of free sections.
*/
f2fs_update_inode_page(inode);
if (wbc && wbc->nr_to_write)
@ -707,8 +751,13 @@ void f2fs_evict_inode(struct inode *inode)
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
if ((inode->i_nlink || is_bad_inode(inode)) &&
test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode))
f2fs_invalidate_compress_pages(sbi, inode->i_ino);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
inode->i_ino == F2FS_META_INO(sbi) ||
inode->i_ino == F2FS_COMPRESS_INO(sbi))
goto out_clear;
f2fs_bug_on(sbi, get_dirty_pages(inode));
@ -719,7 +768,7 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
err = dquot_initialize(inode);
err = f2fs_dquot_initialize(inode);
if (err) {
err = 0;
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
@ -729,7 +778,8 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
sb_start_intwrite(inode->i_sb);
if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
@ -760,7 +810,8 @@ retry:
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
}
sb_end_intwrite(inode->i_sb);
if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
sb_end_intwrite(inode->i_sb);
no_delete:
dquot_drop(inode);
@ -768,7 +819,8 @@ no_delete:
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
stat_dec_compr_inode(inode);
stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
stat_sub_compr_blocks(inode,
atomic_read(&F2FS_I(inode)->i_compr_blocks));
if (unlikely(is_inode_flag_set(inode, FI_DIRTY_INODE))) {
f2fs_inode_synced(inode);
@ -835,9 +887,10 @@ void f2fs_handle_failed_inode(struct inode *inode)
* so we can prevent losing this orphan when encoutering checkpoint
* and following suddenly power-off.
*/
err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
set_inode_flag(inode, FI_FREE_NID);
f2fs_warn(sbi, "May loss orphan inode, run fsck to fix.");
goto out;
}

287
fs/f2fs/iostat.c Normal file
View File

@ -0,0 +1,287 @@
// SPDX-License-Identifier: GPL-2.0
/*
* f2fs iostat support
*
* Copyright 2021 Google LLC
* Author: Daeho Jeong <daehojeong@google.com>
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/seq_file.h>
#include "f2fs.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#define NUM_PREALLOC_IOSTAT_CTXS 128
static struct kmem_cache *bio_iostat_ctx_cache;
static mempool_t *bio_iostat_ctx_pool;
int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
time64_t now = ktime_get_real_seconds();
if (!sbi->iostat_enable)
return 0;
seq_printf(seq, "time: %-16llu\n", now);
/* print app write IOs */
seq_puts(seq, "[WRITE]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_IO]);
/* print fs write IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GC_DATA_IO]);
seq_printf(seq, "fs gc node: %-16llu\n",
sbi->rw_iostat[FS_GC_NODE_IO]);
seq_printf(seq, "fs cp data: %-16llu\n",
sbi->rw_iostat[FS_CP_DATA_IO]);
seq_printf(seq, "fs cp node: %-16llu\n",
sbi->rw_iostat[FS_CP_NODE_IO]);
seq_printf(seq, "fs cp meta: %-16llu\n",
sbi->rw_iostat[FS_CP_META_IO]);
/* print app read IOs */
seq_puts(seq, "[READ]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_READ_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_READ_IO]);
/* print fs read IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_READ_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GDATA_READ_IO]);
seq_printf(seq, "fs compr_data: %-16llu\n",
sbi->rw_iostat[FS_CDATA_READ_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_READ_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_READ_IO]);
/* print other IOs */
seq_puts(seq, "[OTHER]\n");
seq_printf(seq, "fs discard: %-16llu\n",
sbi->rw_iostat[FS_DISCARD]);
return 0;
}
static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
{
int io, idx = 0;
unsigned int cnt;
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
spin_lock_irq(&sbi->iostat_lat_lock);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
cnt = io_lat->bio_cnt[idx][io];
iostat_lat[idx][io].peak_lat =
jiffies_to_msecs(io_lat->peak_lat[idx][io]);
iostat_lat[idx][io].cnt = cnt;
iostat_lat[idx][io].avg_lat = cnt ?
jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0;
io_lat->sum_lat[idx][io] = 0;
io_lat->peak_lat[idx][io] = 0;
io_lat->bio_cnt[idx][io] = 0;
}
}
spin_unlock_irq(&sbi->iostat_lat_lock);
trace_f2fs_iostat_latency(sbi, iostat_lat);
}
static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
spin_lock(&sbi->iostat_lock);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
spin_unlock(&sbi->iostat_lock);
return;
}
sbi->iostat_next_period = jiffies +
msecs_to_jiffies(sbi->iostat_period_ms);
for (i = 0; i < NR_IO_TYPE; i++) {
iostat_diff[i] = sbi->rw_iostat[i] -
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
spin_unlock(&sbi->iostat_lock);
trace_f2fs_iostat(sbi, iostat_diff);
__record_iostat_latency(sbi);
}
void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
{
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int i;
spin_lock(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
spin_unlock(&sbi->iostat_lock);
spin_lock_irq(&sbi->iostat_lat_lock);
memset(io_lat, 0, sizeof(struct iostat_lat_info));
spin_unlock_irq(&sbi->iostat_lat_lock);
}
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
if (!sbi->iostat_enable)
return;
spin_lock(&sbi->iostat_lock);
sbi->rw_iostat[type] += io_bytes;
if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
sbi->rw_iostat[APP_BUFFERED_IO] =
sbi->rw_iostat[APP_WRITE_IO] -
sbi->rw_iostat[APP_DIRECT_IO];
if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_BUFFERED_READ_IO] =
sbi->rw_iostat[APP_READ_IO] -
sbi->rw_iostat[APP_DIRECT_READ_IO];
spin_unlock(&sbi->iostat_lock);
f2fs_record_iostat(sbi);
}
static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
int rw, bool is_sync)
{
unsigned long ts_diff;
unsigned int iotype = iostat_ctx->type;
unsigned long flags;
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int idx;
if (!sbi->iostat_enable)
return;
ts_diff = jiffies - iostat_ctx->submit_ts;
if (iotype >= META_FLUSH)
iotype = META;
if (rw == 0) {
idx = READ_IO;
} else {
if (is_sync)
idx = WRITE_SYNC_IO;
else
idx = WRITE_ASYNC_IO;
}
spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
io_lat->sum_lat[idx][iotype] += ts_diff;
io_lat->bio_cnt[idx][iotype]++;
if (ts_diff > io_lat->peak_lat[idx][iotype])
io_lat->peak_lat[idx][iotype] = ts_diff;
spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
}
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
{
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
bool is_sync = bio->bi_opf & REQ_SYNC;
if (rw == 0)
bio->bi_private = iostat_ctx->post_read_ctx;
else
bio->bi_private = iostat_ctx->sbi;
__update_iostat_latency(iostat_ctx, rw, is_sync);
mempool_free(iostat_ctx, bio_iostat_ctx_pool);
}
void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx)
{
struct bio_iostat_ctx *iostat_ctx;
/* Due to the mempool, this never fails. */
iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS);
iostat_ctx->sbi = sbi;
iostat_ctx->submit_ts = 0;
iostat_ctx->type = 0;
iostat_ctx->post_read_ctx = ctx;
bio->bi_private = iostat_ctx;
}
int __init f2fs_init_iostat_processing(void)
{
bio_iostat_ctx_cache =
kmem_cache_create("f2fs_bio_iostat_ctx",
sizeof(struct bio_iostat_ctx), 0, 0, NULL);
if (!bio_iostat_ctx_cache)
goto fail;
bio_iostat_ctx_pool =
mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS,
bio_iostat_ctx_cache);
if (!bio_iostat_ctx_pool)
goto fail_free_cache;
return 0;
fail_free_cache:
kmem_cache_destroy(bio_iostat_ctx_cache);
fail:
return -ENOMEM;
}
void f2fs_destroy_iostat_processing(void)
{
mempool_destroy(bio_iostat_ctx_pool);
kmem_cache_destroy(bio_iostat_ctx_cache);
}
int f2fs_init_iostat(struct f2fs_sb_info *sbi)
{
/* init iostat info */
spin_lock_init(&sbi->iostat_lock);
spin_lock_init(&sbi->iostat_lat_lock);
sbi->iostat_enable = false;
sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info),
GFP_KERNEL);
if (!sbi->iostat_io_lat)
return -ENOMEM;
return 0;
}
void f2fs_destroy_iostat(struct f2fs_sb_info *sbi)
{
kfree(sbi->iostat_io_lat);
}

84
fs/f2fs/iostat.h Normal file
View File

@ -0,0 +1,84 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright 2021 Google LLC
* Author: Daeho Jeong <daehojeong@google.com>
*/
#ifndef __F2FS_IOSTAT_H__
#define __F2FS_IOSTAT_H__
struct bio_post_read_ctx;
#ifdef CONFIG_F2FS_IOSTAT
#define DEFAULT_IOSTAT_PERIOD_MS 3000
#define MIN_IOSTAT_PERIOD_MS 100
/* maximum period of iostat tracing is 1 day */
#define MAX_IOSTAT_PERIOD_MS 8640000
enum {
READ_IO,
WRITE_SYNC_IO,
WRITE_ASYNC_IO,
MAX_IO_TYPE,
};
struct iostat_lat_info {
unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */
unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */
unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */
};
extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
void *offset);
extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi);
extern void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes);
struct bio_iostat_ctx {
struct f2fs_sb_info *sbi;
unsigned long submit_ts;
enum page_type type;
struct bio_post_read_ctx *post_read_ctx;
};
static inline void iostat_update_submit_ctx(struct bio *bio,
enum page_type type)
{
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
iostat_ctx->submit_ts = jiffies;
iostat_ctx->type = type;
}
static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
{
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
return iostat_ctx->post_read_ctx;
}
extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw);
extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx);
extern int f2fs_init_iostat_processing(void);
extern void f2fs_destroy_iostat_processing(void);
extern int f2fs_init_iostat(struct f2fs_sb_info *sbi);
extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi);
#else
static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes) {}
static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {}
static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx) {}
static inline void iostat_update_submit_ctx(struct bio *bio,
enum page_type type) {}
static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
{
return bio->bi_private;
}
static inline int f2fs_init_iostat_processing(void) { return 0; }
static inline void f2fs_destroy_iostat_processing(void) {}
static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {}
#endif
#endif /* __F2FS_IOSTAT_H__ */

View File

@ -69,7 +69,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
F2FS_DEF_PROJID);
err = dquot_initialize(inode);
err = f2fs_dquot_initialize(inode);
if (err)
goto fail_drop;
@ -148,7 +148,8 @@ fail_drop:
return ERR_PTR(err);
}
static inline int is_extension_exist(const unsigned char *s, const char *sub)
static inline int is_extension_exist(const unsigned char *s, const char *sub,
bool tmp_ext)
{
size_t slen = strlen(s);
size_t sublen = strlen(sub);
@ -164,6 +165,13 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
if (slen < sublen + 2)
return 0;
if (!tmp_ext) {
/* file has no temp extension */
if (s[slen - sublen - 1] != '.')
return 0;
return !strncasecmp(s + slen - sublen, sub, sublen);
}
for (i = 1; i < slen - sublen; i++) {
if (s[i] != '.')
continue;
@ -183,17 +191,17 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
int i, cold_count, hot_count;
down_read(&sbi->sb_lock);
f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
for (i = 0; i < cold_count + hot_count; i++) {
if (is_extension_exist(name, extlist[i]))
if (is_extension_exist(name, extlist[i], true))
break;
}
up_read(&sbi->sb_lock);
f2fs_up_read(&sbi->sb_lock);
if (i == cold_count + hot_count)
return;
@ -274,34 +282,44 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
{
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
unsigned char (*ext)[F2FS_EXTENSION_LEN];
unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions;
unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions;
unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
int i, cold_count, hot_count;
if (!f2fs_sb_has_compression(sbi) ||
is_inode_flag_set(inode, FI_COMPRESSED_FILE) ||
F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
!f2fs_may_compress(inode))
!f2fs_may_compress(inode) ||
(!ext_cnt && !noext_cnt))
return;
down_read(&sbi->sb_lock);
f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
for (i = cold_count; i < cold_count + hot_count; i++) {
if (is_extension_exist(name, extlist[i])) {
up_read(&sbi->sb_lock);
if (is_extension_exist(name, extlist[i], false)) {
f2fs_up_read(&sbi->sb_lock);
return;
}
}
up_read(&sbi->sb_lock);
f2fs_up_read(&sbi->sb_lock);
ext = F2FS_OPTION(sbi).extensions;
for (i = 0; i < noext_cnt; i++) {
if (is_extension_exist(name, noext[i], false)) {
f2fs_disable_compressed_file(inode);
return;
}
}
if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
return;
for (i = 0; i < ext_cnt; i++) {
if (!is_extension_exist(name, ext[i]))
if (!is_extension_exist(name, ext[i], false))
continue;
set_compress_context(inode);
@ -322,7 +340,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@ -381,7 +399,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
F2FS_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@ -414,6 +432,7 @@ struct dentry *f2fs_get_parent(struct dentry *child)
struct qstr dotdot = QSTR_INIT("..", 2);
struct page *page;
unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
if (!ino) {
if (IS_ERR(page))
return ERR_CAST(page);
@ -437,7 +456,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
return 0;
}
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@ -492,7 +511,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
}
err = f2fs_prepare_lookup(dir, dentry, &fname);
generic_set_encrypted_ci_d_ops(dir, dentry);
generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
goto out_splice;
if (err)
@ -570,15 +589,17 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
trace_f2fs_unlink_enter(dir, dentry);
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto fail;
}
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
err = dquot_initialize(inode);
goto fail;
err = f2fs_dquot_initialize(inode);
if (err)
return err;
goto fail;
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (!de) {
@ -601,7 +622,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at f2fs_lookup(), when it is better
* negative dentries at f2fs_lookup(), when it is better
* supported by the VFS for the CI case.
*/
if (IS_CASEFOLDED(dir))
@ -621,6 +642,7 @@ static const char *f2fs_get_link(struct dentry *dentry,
struct delayed_call *done)
{
const char *link = page_get_link(dentry, inode, done);
if (!IS_ERR(link) && !*link) {
/* this is broken symlink case */
do_delayed_call(done);
@ -649,7 +671,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
return err;
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@ -706,7 +728,7 @@ out_f2fs_handle_failed_inode:
f2fs_handle_failed_inode(inode);
out_free_encrypted_link:
if (disk_link.name != (unsigned char *)symname)
kvfree(disk_link.name);
kfree(disk_link.name);
return err;
}
@ -719,7 +741,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@ -730,7 +752,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
inode_nohighmem(inode);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
@ -758,6 +780,7 @@ out_fail:
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
if (f2fs_empty_dir(inode))
return f2fs_unlink(dir, dentry);
return -ENOTEMPTY;
@ -775,7 +798,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@ -813,7 +836,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err;
err = dquot_initialize(dir);
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@ -848,7 +871,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (whiteout) {
f2fs_i_links_write(inode, false);
spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
@ -932,16 +959,16 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
return err;
}
err = dquot_initialize(old_dir);
err = f2fs_dquot_initialize(old_dir);
if (err)
goto out;
err = dquot_initialize(new_dir);
err = f2fs_dquot_initialize(new_dir);
if (err)
goto out;
if (new_inode) {
err = dquot_initialize(new_inode);
err = f2fs_dquot_initialize(new_inode);
if (err)
goto out;
}
@ -990,11 +1017,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_page = NULL;
new_inode->i_ctime = current_time(new_inode);
down_write(&F2FS_I(new_inode)->i_sem);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
f2fs_i_links_write(new_inode, false);
up_write(&F2FS_I(new_inode)->i_sem);
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
if (!new_inode->i_nlink)
f2fs_add_orphan_inode(new_inode);
@ -1015,13 +1042,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, true);
}
down_write(&F2FS_I(old_inode)->i_sem);
f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
@ -1034,7 +1061,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = f2fs_add_link(old_dentry, whiteout);
if (err)
goto put_out_dir;
spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
spin_unlock(&whiteout->i_lock);
iput(whiteout);
}
@ -1070,8 +1101,7 @@ out_dir:
out_old:
f2fs_put_page(old_page, 0);
out:
if (whiteout)
iput(whiteout);
iput(whiteout);
return err;
}
@ -1101,11 +1131,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
F2FS_I(new_dentry->d_inode)->i_projid)))
return -EXDEV;
err = dquot_initialize(old_dir);
err = f2fs_dquot_initialize(old_dir);
if (err)
goto out;
err = dquot_initialize(new_dir);
err = f2fs_dquot_initialize(new_dir);
if (err)
goto out;
@ -1177,38 +1207,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
/* update directory entry info of old dir inode */
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
down_write(&F2FS_I(old_inode)->i_sem);
f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
if (old_nlink) {
down_write(&F2FS_I(old_dir)->i_sem);
f2fs_down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
up_write(&F2FS_I(old_dir)->i_sem);
f2fs_up_write(&F2FS_I(old_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
down_write(&F2FS_I(new_inode)->i_sem);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (!new_dir_entry)
file_lost_pino(new_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(new_inode, old_dir->i_ino);
up_write(&F2FS_I(new_inode)->i_sem);
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
if (new_nlink) {
down_write(&F2FS_I(new_dir)->i_sem);
f2fs_down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
up_write(&F2FS_I(new_dir)->i_sem);
f2fs_up_write(&F2FS_I(new_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(new_dir, false);
@ -1286,7 +1316,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.get_link = f2fs_encrypted_get_link,
.get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.listxattr = f2fs_listxattr,
@ -1312,7 +1342,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
};
const struct inode_operations f2fs_symlink_inode_operations = {
.get_link = f2fs_get_link,
.get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.listxattr = f2fs_listxattr,
@ -1320,7 +1350,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
const struct inode_operations f2fs_special_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,

View File

@ -17,7 +17,7 @@
#include "node.h"
#include "segment.h"
#include "xattr.h"
#include "trace.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
@ -44,11 +44,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct sysinfo val;
unsigned long avail_ram;
unsigned long mem_size = 0;
bool res = false;
if (!nm_i)
return true;
si_meminfo(&val);
/* only uses low memory */
@ -62,8 +66,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct free_nid)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
PAGE_SHIFT;
mem_size = (nm_i->nat_cnt[TOTAL_NAT] *
sizeof(struct nat_entry)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
if (excess_cached_nats(sbi))
res = false;
@ -90,6 +94,24 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
/* it allows 20% / total_ram for inmemory pages */
mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
res = mem_size < (val.totalram / 5);
} else if (type == DISCARD_CACHE) {
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
} else if (type == COMPRESS_PAGE) {
#ifdef CONFIG_F2FS_FS_COMPRESSION
unsigned long free_ram = val.freeram;
/*
* free memory is lower than watermark or cached page count
* exceed threshold, deny caching compress page.
*/
res = (free_ram > avail_ram * sbi->compress_watermark / 100) &&
(COMPRESS_MAPPING(sbi)->nrpages <
free_ram * sbi->compress_percent / 100);
#else
res = false;
#endif
} else {
if (!sbi->sb->s_bdi->wb.dirty_exceeded)
return true;
@ -109,7 +131,7 @@ static void clear_node_page_dirty(struct page *page)
static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid));
return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid));
}
static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
@ -141,14 +163,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
return dst_page;
}
static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
nid_t nid, bool no_fail)
{
struct nat_entry *new;
if (no_fail)
new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
else
new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
new = f2fs_kmem_cache_alloc(nat_entry_slab,
GFP_F2FS_ZERO, no_fail, sbi);
if (new) {
nat_set_nid(new, nid);
nat_reset_flag(new);
@ -177,7 +198,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
list_add_tail(&ne->list, &nm_i->nat_entries);
spin_unlock(&nm_i->nat_list_lock);
nm_i->nat_cnt++;
nm_i->nat_cnt[TOTAL_NAT]++;
nm_i->nat_cnt[RECLAIMABLE_NAT]++;
return ne;
}
@ -207,7 +229,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
{
radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
nm_i->nat_cnt--;
nm_i->nat_cnt[TOTAL_NAT]--;
nm_i->nat_cnt[RECLAIMABLE_NAT]--;
__free_nat_entry(e);
}
@ -219,7 +242,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
head = f2fs_kmem_cache_alloc(nat_entry_set_slab,
GFP_NOFS, true, NULL);
INIT_LIST_HEAD(&head->entry_list);
INIT_LIST_HEAD(&head->set_list);
@ -253,7 +277,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
if (get_nat_flag(ne, IS_DIRTY))
goto refresh_list;
nm_i->dirty_nat_cnt++;
nm_i->nat_cnt[DIRTY_NAT]++;
nm_i->nat_cnt[RECLAIMABLE_NAT]--;
set_nat_flag(ne, IS_DIRTY, true);
refresh_list:
spin_lock(&nm_i->nat_list_lock);
@ -273,7 +298,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
set_nat_flag(ne, IS_DIRTY, false);
set->entry_cnt--;
nm_i->dirty_nat_cnt--;
nm_i->nat_cnt[DIRTY_NAT]--;
nm_i->nat_cnt[RECLAIMABLE_NAT]++;
}
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
@ -304,7 +330,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
unsigned long flags;
unsigned int seq_id;
fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab,
GFP_NOFS, true, NULL);
get_page(page);
fn->page = page;
@ -355,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
down_read(&nm_i->nat_tree_lock);
f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
return need;
}
@ -372,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
down_read(&nm_i->nat_tree_lock);
f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@ -386,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
down_read(&nm_i->nat_tree_lock);
f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
return need_update;
}
@ -403,11 +430,15 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *new, *e;
new = __alloc_nat_entry(nid, false);
/* Let's mitigate lock contention of nat_tree_lock during checkpoint */
if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
return;
new = __alloc_nat_entry(sbi, nid, false);
if (!new)
return;
down_write(&nm_i->nat_tree_lock);
f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e)
e = __init_nat_entry(nm_i, new, ne, false);
@ -416,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
nat_get_blkaddr(e) !=
le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
up_write(&nm_i->nat_tree_lock);
f2fs_up_write(&nm_i->nat_tree_lock);
if (e != new)
__free_nat_entry(new);
}
@ -426,9 +457,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
down_write(&nm_i->nat_tree_lock);
f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = __init_nat_entry(nm_i, new, NULL, true);
@ -459,6 +490,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
/* increment version no as node is removed */
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
nat_set_version(e, inc_node_version(version));
}
@ -476,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
up_write(&nm_i->nat_tree_lock);
f2fs_up_write(&nm_i->nat_tree_lock);
}
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@ -484,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
if (!down_write_trylock(&nm_i->nat_tree_lock))
if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
return 0;
spin_lock(&nm_i->nat_list_lock);
@ -506,12 +538,12 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
}
spin_unlock(&nm_i->nat_list_lock);
up_write(&nm_i->nat_tree_lock);
f2fs_up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
struct node_info *ni)
struct node_info *ni, bool checkpoint_context)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@ -526,36 +558,46 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
int i;
ni->nid = nid;
retry:
/* Check nat cache */
down_read(&nm_i->nat_tree_lock);
f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
return 0;
}
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
/*
* Check current segment summary by trying to grab journal_rwsem first.
* This sem is on the critical path on the checkpoint requiring the above
* nat_tree_lock. Therefore, we should retry, if we failed to grab here
* while not bothering checkpoint.
*/
if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
down_read(&curseg->journal_rwsem);
} else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
!down_read_trylock(&curseg->journal_rwsem)) {
f2fs_up_read(&nm_i->nat_tree_lock);
goto retry;
}
/* Check current segment summary */
down_read(&curseg->journal_rwsem);
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
up_read(&curseg->journal_rwsem);
up_read(&curseg->journal_rwsem);
if (i >= 0) {
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
}
/* Fill node_info from nat page */
index = current_nat_addr(sbi, nid);
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
page = f2fs_get_meta_page(sbi, index);
if (IS_ERR(page))
@ -804,6 +846,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
f2fs_sb_has_readonly(sbi)) {
unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
block_t blkaddr;
if (!c_len)
goto out;
blkaddr = f2fs_data_blkaddr(dn);
if (blkaddr == COMPRESS_ADDR)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + 1);
f2fs_update_extent_tree_range_compressed(dn->inode,
index, blkaddr,
F2FS_I(dn->inode)->i_cluster_size,
c_len);
}
out:
return 0;
release_pages:
@ -828,7 +890,7 @@ static int truncate_node(struct dnode_of_data *dn)
int err;
pgoff_t index;
err = f2fs_get_node_info(sbi, dn->nid, &ni);
err = f2fs_get_node_info(sbi, dn->nid, &ni, false);
if (err)
return err;
@ -1039,8 +1101,10 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
if (level < 0)
if (level < 0) {
trace_f2fs_truncate_inode_blocks_exit(inode, level);
return level;
}
page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@ -1225,7 +1289,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
goto fail;
#ifdef CONFIG_F2FS_CHECK_FS
err = f2fs_get_node_info(sbi, dn->nid, &new_ni);
err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false);
if (err) {
dec_valid_node_count(sbi, dn->inode, !ofs);
goto fail;
@ -1287,11 +1351,12 @@ static int read_node_page(struct page *page, int op_flags)
return LOCKED_PAGE;
}
err = f2fs_get_node_info(sbi, page->index, &ni);
err = f2fs_get_node_info(sbi, page->index, &ni, false);
if (err)
return err;
if (unlikely(ni.blk_addr == NULL_ADDR) ||
/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
ClearPageUptodate(page);
return -ENOENT;
@ -1378,11 +1443,12 @@ repeat:
goto out_err;
}
page_hit:
if(unlikely(nid != nid_of_node(page))) {
if (unlikely(nid != nid_of_node(page))) {
f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
set_sbi_flag(sbi, SBI_NEED_FSCK);
err = -EINVAL;
out_err:
ClearPageUptodate(page);
@ -1521,13 +1587,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
trace_f2fs_writepage(page, NODE);
if (unlikely(f2fs_cp_error(sbi))) {
if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
unlock_page(page);
return 0;
}
goto redirty_out;
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
unlock_page(page);
return 0;
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@ -1542,21 +1605,21 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
nid = nid_of_node(page);
f2fs_bug_on(sbi, page->index != nid);
if (f2fs_get_node_info(sbi, nid, &ni))
if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
goto redirty_out;
if (wbc->for_reclaim) {
if (!down_read_trylock(&sbi->node_write))
if (!f2fs_down_read_trylock(&sbi->node_write))
goto redirty_out;
} else {
down_read(&sbi->node_write);
f2fs_down_read(&sbi->node_write);
}
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
up_read(&sbi->node_write);
f2fs_up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
@ -1564,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (__is_valid_data_blkaddr(ni.blk_addr) &&
!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
DATA_GENERIC_ENHANCE)) {
up_read(&sbi->node_write);
f2fs_up_read(&sbi->node_write);
goto redirty_out;
}
@ -1585,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
up_read(&sbi->node_write);
f2fs_up_read(&sbi->node_write);
if (wbc->for_reclaim) {
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
@ -1719,6 +1782,7 @@ continue_unlock:
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
percpu_counter_inc(&sbi->rf_node_block_count);
if (IS_INODE(page)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
@ -1726,7 +1790,7 @@ continue_unlock:
set_dentry_mark(page,
f2fs_need_dentry_mark(sbi, ino));
}
/* may be written by other thread */
/* may be written by other thread */
if (!PageDirty(page))
set_page_dirty(page);
}
@ -1770,7 +1834,7 @@ continue_unlock:
out:
if (nwritten)
f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
return ret ? -EIO: 0;
return ret ? -EIO : 0;
}
static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
@ -1814,12 +1878,11 @@ static bool flush_dirty_inode(struct page *page)
return true;
}
int f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
{
pgoff_t index = 0;
struct pagevec pvec;
int nr_pages;
int ret = 0;
pagevec_init(&pvec);
@ -1847,8 +1910,8 @@ continue_unlock:
}
/* flush inline_data, if it's async context. */
if (is_inline_node(page)) {
clear_inline_node(page);
if (page_private_inline(page)) {
clear_page_private_inline(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
continue;
@ -1858,7 +1921,6 @@ continue_unlock:
pagevec_release(&pvec);
cond_resched();
}
return ret;
}
int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
@ -1924,9 +1986,13 @@ continue_unlock:
goto continue_unlock;
}
/* flush inline_data, if it's async context. */
if (do_balance && is_inline_node(page)) {
clear_inline_node(page);
/* flush inline_data/inode, if it's async context. */
if (!do_balance)
goto write_node;
/* flush inline_data */
if (page_private_inline(page)) {
clear_page_private_inline(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
goto lock_node;
@ -1938,7 +2004,7 @@ continue_unlock:
if (flush_dirty_inode(page))
goto lock_node;
}
write_node:
f2fs_wait_on_page_writeback(page, NODE, true, true);
if (!clear_page_dirty_for_io(page))
@ -2046,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[NODE]);
else if (atomic_read(&sbi->wb_sync_req[NODE]))
else if (atomic_read(&sbi->wb_sync_req[NODE])) {
/* to avoid potential deadlock */
if (current->plug)
blk_finish_plug(current->plug);
goto skip_write;
}
trace_f2fs_writepages(mapping->host, wbc, NODE);
@ -2080,8 +2150,7 @@ static int f2fs_set_node_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
set_page_private_reference(page);
return 1;
}
return 0;
@ -2097,7 +2166,7 @@ const struct address_space_operations f2fs_node_aops = {
.invalidatepage = f2fs_invalidate_page,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
.migratepage = f2fs_migrate_page,
#endif
};
@ -2108,18 +2177,16 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
}
static int __insert_free_nid(struct f2fs_sb_info *sbi,
struct free_nid *i, enum nid_state state)
struct free_nid *i)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
if (err)
return err;
f2fs_bug_on(sbi, state != i->state);
nm_i->nid_cnt[state]++;
if (state == FREE_NID)
list_add_tail(&i->list, &nm_i->free_nid_list);
nm_i->nid_cnt[FREE_NID]++;
list_add_tail(&i->list, &nm_i->free_nid_list);
return 0;
}
@ -2157,6 +2224,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
}
}
bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int i;
bool ret = true;
f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
ret = false;
break;
}
}
f2fs_up_read(&nm_i->nat_tree_lock);
return ret;
}
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
bool set, bool build)
{
@ -2198,7 +2283,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
if (unlikely(f2fs_check_nid_range(sbi, nid)))
return false;
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL);
i->nid = nid;
i->state = FREE_NID;
@ -2241,7 +2326,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
}
}
ret = true;
err = __insert_free_nid(sbi, i, FREE_NID);
err = __insert_free_nid(sbi, i);
err_out:
if (update) {
update_free_nid_bitmap(sbi, nid, ret, build);
@ -2335,7 +2420,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
unsigned int i, idx;
nid_t nid;
down_read(&nm_i->nat_tree_lock);
f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap))
@ -2358,7 +2443,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
out:
scan_curseg_cache(sbi);
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
@ -2393,7 +2478,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
down_read(&nm_i->nat_tree_lock);
f2fs_down_read(&nm_i->nat_tree_lock);
while (1) {
if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
@ -2408,7 +2493,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
}
if (ret) {
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
return ret;
}
@ -2428,7 +2513,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
/* find free nids from current sum_pages */
scan_curseg_cache(sbi);
up_read(&nm_i->nat_tree_lock);
f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@ -2575,7 +2660,7 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
return nr - nr_shrink;
}
void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
size_t inline_size;
@ -2583,13 +2668,20 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
struct f2fs_inode *ri;
ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
if (IS_ERR(ipage))
return PTR_ERR(ipage);
ri = F2FS_INODE(page);
if (ri->i_inline & F2FS_INLINE_XATTR) {
set_inode_flag(inode, FI_INLINE_XATTR);
if (!f2fs_has_inline_xattr(inode)) {
set_inode_flag(inode, FI_INLINE_XATTR);
stat_inc_inline_xattr(inode);
}
} else {
clear_inode_flag(inode, FI_INLINE_XATTR);
if (f2fs_has_inline_xattr(inode)) {
stat_dec_inline_xattr(inode);
clear_inode_flag(inode, FI_INLINE_XATTR);
}
goto update_inode;
}
@ -2602,6 +2694,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
update_inode:
f2fs_update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
return 0;
}
int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
@ -2618,7 +2711,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
goto recover_xnid;
/* 1: invalidate the previous xattr nid */
err = f2fs_get_node_info(sbi, prev_xnid, &ni);
err = f2fs_get_node_info(sbi, prev_xnid, &ni, false);
if (err)
return err;
@ -2658,7 +2751,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
struct page *ipage;
int err;
err = f2fs_get_node_info(sbi, ino, &old_ni);
err = f2fs_get_node_info(sbi, ino, &old_ni, false);
if (err)
return err;
@ -2682,7 +2775,7 @@ retry:
src = F2FS_INODE(page);
dst = F2FS_INODE(ipage);
memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
dst->i_size = 0;
dst->i_blocks = cpu_to_le64(1);
dst->i_links = cpu_to_le32(1);
@ -2773,11 +2866,14 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
struct f2fs_nat_entry raw_ne;
nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
if (f2fs_check_nid_range(sbi, nid))
continue;
raw_ne = nat_in_journal(journal, i);
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
ne = __alloc_nat_entry(nid, true);
ne = __alloc_nat_entry(sbi, nid, true);
__init_nat_entry(nm_i, ne, &raw_ne, true);
}
@ -2817,7 +2913,23 @@ add_out:
list_add_tail(&nes->set_list, head);
}
static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
unsigned int valid)
{
if (valid == 0) {
__set_bit_le(nat_ofs, nm_i->empty_nat_bits);
__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
return;
}
__clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
if (valid == NAT_ENTRY_PER_BLOCK)
__set_bit_le(nat_ofs, nm_i->full_nat_bits);
else
__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
}
static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
struct page *page)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@ -2826,7 +2938,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
int valid = 0;
int i = 0;
if (!enabled_nat_bits(sbi, NULL))
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return;
if (nat_index == 0) {
@ -2837,17 +2949,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
valid++;
}
if (valid == 0) {
__set_bit_le(nat_index, nm_i->empty_nat_bits);
__clear_bit_le(nat_index, nm_i->full_nat_bits);
return;
__update_nat_bits(nm_i, nat_index, valid);
}
void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs;
f2fs_down_read(&nm_i->nat_tree_lock);
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
unsigned int valid = 0, nid_ofs = 0;
/* handle nid zero due to it should never be used */
if (unlikely(nat_ofs == 0)) {
valid = 1;
nid_ofs = 1;
}
for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
if (!test_bit_le(nid_ofs,
nm_i->free_nid_bitmap[nat_ofs]))
valid++;
}
__update_nat_bits(nm_i, nat_ofs, valid);
}
__clear_bit_le(nat_index, nm_i->empty_nat_bits);
if (valid == NAT_ENTRY_PER_BLOCK)
__set_bit_le(nat_index, nm_i->full_nat_bits);
else
__clear_bit_le(nat_index, nm_i->full_nat_bits);
f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@ -2866,7 +2997,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
if (enabled_nat_bits(sbi, cpc) ||
if ((cpc->reason & CP_UMOUNT) ||
!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
@ -2913,7 +3044,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
if (to_journal) {
up_write(&curseg->journal_rwsem);
} else {
__update_nat_bits(sbi, start_nid, page);
update_nat_bits(sbi, start_nid, page);
f2fs_put_page(page, 1);
}
@ -2940,30 +3071,35 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
LIST_HEAD(sets);
int err = 0;
/* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
if (enabled_nat_bits(sbi, cpc)) {
down_write(&nm_i->nat_tree_lock);
/*
* during unmount, let's flush nat_bits before checking
* nat_cnt[DIRTY_NAT].
*/
if (cpc->reason & CP_UMOUNT) {
f2fs_down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
up_write(&nm_i->nat_tree_lock);
f2fs_up_write(&nm_i->nat_tree_lock);
}
if (!nm_i->dirty_nat_cnt)
if (!nm_i->nat_cnt[DIRTY_NAT])
return 0;
down_write(&nm_i->nat_tree_lock);
f2fs_down_write(&nm_i->nat_tree_lock);
/*
* if there are no enough space in journal to store dirty nat
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
if (enabled_nat_bits(sbi, cpc) ||
!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
if (cpc->reason & CP_UMOUNT ||
!__has_cursum_space(journal,
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
__adjust_nat_entry_set(setvec[idx], &sets,
@ -2977,7 +3113,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
}
up_write(&nm_i->nat_tree_lock);
f2fs_up_write(&nm_i->nat_tree_lock);
/* Allow dirty nats by node block allocation in write_begin */
return err;
@ -2992,15 +3128,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
__u64 cp_ver = cur_cp_version(ckpt);
block_t nat_bits_addr;
if (!enabled_nat_bits(sbi, NULL))
return 0;
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
nm_i->nat_bits = f2fs_kvzalloc(sbi,
nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
nm_i->full_nat_bits = nm_i->nat_bits + 8;
nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return 0;
nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
@ -3017,13 +3156,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
cp_ver |= (cur_cp_crc(ckpt) << 32);
if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
disable_nat_bits(sbi, true);
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
return 0;
}
nm_i->full_nat_bits = nm_i->nat_bits + 8;
nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
f2fs_notice(sbi, "Found nat_bits in checkpoint");
return 0;
}
@ -3034,7 +3172,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
unsigned int i = 0;
nid_t nid, last_nid;
if (!enabled_nat_bits(sbi, NULL))
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return;
for (i = 0; i < nm_i->nat_blocks; i++) {
@ -3082,10 +3220,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
F2FS_RESERVED_NODE_NUM;
nm_i->nid_cnt[FREE_NID] = 0;
nm_i->nid_cnt[PREALLOC_NID] = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
@ -3096,14 +3234,11 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->nid_list_lock);
init_rwsem(&nm_i->nat_tree_lock);
init_f2fs_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
if (!version_bitmap)
return -EFAULT;
nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
GFP_KERNEL);
if (!nm_i->nat_bitmap)
@ -3205,7 +3340,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
down_write(&nm_i->nat_tree_lock);
f2fs_down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@ -3219,7 +3354,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
__del_from_nat_cache(nm_i, natvec[idx]);
}
}
f2fs_bug_on(sbi, nm_i->nat_cnt);
f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]);
/* destroy nat set cache */
nid = 0;
@ -3235,7 +3370,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
up_write(&nm_i->nat_tree_lock);
f2fs_up_write(&nm_i->nat_tree_lock);
kvfree(nm_i->nat_block_bitmap);
if (nm_i->free_nid_bitmap) {
@ -3253,7 +3388,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
kvfree(nm_i->nat_bitmap_mir);
#endif
sbi->nm_info = NULL;
kvfree(nm_i);
kfree(nm_i);
}
int __init f2fs_create_node_manager_caches(void)

View File

@ -31,6 +31,9 @@
/* control total # of nats */
#define DEF_NAT_CACHE_THRESHOLD 100000
/* control total # of node writes used for roll-fowrad recovery */
#define DEF_RF_NODE_BLOCKS 0
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
#define SETVEC_SIZE 32
@ -38,6 +41,9 @@
/* return value for read_node_page */
#define LOCKED_PAGE 1
/* check pinned file's alignment status of physical blocks */
#define FILE_NOT_ALIGNED 1
/* For flag in struct node_info */
enum {
IS_CHECKPOINTED, /* is it checkpointed before? */
@ -126,18 +132,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
{
return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid *
NM_I(sbi)->dirty_nats_ratio / 100;
}
static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
{
return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
}
static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
{
return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8;
return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD;
}
enum mem_type {
@ -147,6 +148,8 @@ enum mem_type {
INO_ENTRIES, /* indicates inode entries */
EXTENT_CACHE, /* indicates extent cache */
INMEM_PAGES, /* indicates inmemory pages */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
};
@ -388,20 +391,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
static inline int is_cold_data(struct page *page)
{
return PageChecked(page);
}
static inline void set_cold_data(struct page *page)
{
SetPageChecked(page);
}
static inline void clear_cold_data(struct page *page)
{
ClearPageChecked(page);
}
static inline int is_node(struct page *page, int type)
{
@ -413,21 +402,6 @@ static inline int is_node(struct page *page, int type)
#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
static inline int is_inline_node(struct page *page)
{
return PageChecked(page);
}
static inline void set_inline_node(struct page *page)
{
SetPageChecked(page);
}
static inline void clear_inline_node(struct page *page)
{
ClearPageChecked(page);
}
static inline void set_cold_node(struct page *page, bool is_dir)
{
struct f2fs_node *rn = F2FS_NODE(page);

View File

@ -45,12 +45,20 @@
static struct kmem_cache *fsync_entry_slab;
#ifdef CONFIG_UNICODE
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
{
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
if (NM_I(sbi)->max_rf_node_blocks &&
percpu_counter_sum_positive(&sbi->rf_node_block_count) >=
NM_I(sbi)->max_rf_node_blocks)
return false;
return true;
}
@ -77,7 +85,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
if (IS_ERR(inode))
return ERR_CAST(inode);
err = dquot_initialize(inode);
err = f2fs_dquot_initialize(inode);
if (err)
goto err_out;
@ -87,7 +95,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
goto err_out;
}
entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
entry = f2fs_kmem_cache_alloc(fsync_entry_slab,
GFP_F2FS_ZERO, true, NULL);
entry->inode = inode;
list_add_tail(&entry->list, head);
@ -145,7 +154,7 @@ static int init_recovered_filename(const struct inode *dir,
f2fs_hash_filename(dir, fname);
#ifdef CONFIG_UNICODE
/* Case-sensitive match is fine for recovery */
kfree(fname->cf_name.name);
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
#endif
} else {
@ -198,7 +207,7 @@ retry:
goto out_put;
}
err = dquot_initialize(einode);
err = f2fs_dquot_initialize(einode);
if (err) {
iput(einode);
goto out_put;
@ -337,6 +346,19 @@ static int recover_inode(struct inode *inode, struct page *page)
return 0;
}
static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
unsigned int ra_blocks, unsigned int blkaddr,
unsigned int next_blkaddr)
{
if (blkaddr + 1 == next_blkaddr)
ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
ra_blocks * 2);
else if (next_blkaddr % sbi->blocks_per_seg)
ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
ra_blocks / 2);
return ra_blocks;
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
bool check_only)
{
@ -344,6 +366,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
struct page *page = NULL;
block_t blkaddr;
unsigned int loop_cnt = 0;
unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg -
valid_user_blocks(sbi);
int err = 0;
@ -418,11 +441,14 @@ next:
break;
}
ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
next_blkaddr_of_node(page));
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
f2fs_ra_meta_pages_cond(sbi, blkaddr);
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
return err;
}
@ -458,6 +484,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
/* Get the previous summary */
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
goto got_it;
@ -502,7 +529,7 @@ got_it:
if (IS_ERR(inode))
return PTR_ERR(inode);
ret = dquot_initialize(inode);
ret = f2fs_dquot_initialize(inode);
if (ret) {
iput(inode);
return ret;
@ -554,7 +581,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
/* step 1: recover xattr */
if (IS_INODE(page)) {
f2fs_recover_inline_xattr(inode, page);
err = f2fs_recover_inline_xattr(inode, page);
if (err)
goto out;
} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
err = f2fs_recover_xattr_data(inode, page);
if (!err)
@ -563,8 +592,12 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* step 2: recover inline data */
if (f2fs_recover_inline_data(inode, page))
err = f2fs_recover_inline_data(inode, page);
if (err) {
if (err == 1)
err = 0;
goto out;
}
/* step 3: recover data indices */
start = f2fs_start_bidx_of_node(ofs_of_node(page), inode);
@ -583,7 +616,7 @@ retry_dn:
f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
err = f2fs_get_node_info(sbi, dn.nid, &ni);
err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
if (err)
goto err;
@ -692,6 +725,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct page *page = NULL;
int err = 0;
block_t blkaddr;
unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@ -703,8 +737,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
f2fs_ra_meta_pages_cond(sbi, blkaddr);
page = f2fs_get_tmp_page(sbi, blkaddr);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@ -747,12 +779,17 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (entry->blkaddr == blkaddr)
list_move_tail(&entry->list, tmp_inode_list);
next:
ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
next_blkaddr_of_node(page));
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
if (!err)
f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
f2fs_allocate_new_segments(sbi);
return err;
}
@ -774,25 +811,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
}
#ifdef CONFIG_QUOTA
/* Needed for iput() to work correctly and not trash data */
sbi->sb->s_flags |= MS_ACTIVE;
/* Turn on quotas so that they are updated correctly */
quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
#endif
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
if (!fsync_entry_slab) {
err = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&inode_list);
INIT_LIST_HEAD(&tmp_inode_list);
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
f2fs_down_write(&sbi->cp_global_sem);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only);
@ -810,10 +838,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
else {
/* restore s_flags to let iput() trash data */
sbi->sb->s_flags = s_flags;
}
else
f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE);
skip:
destroy_fsync_dnodes(&inode_list, err);
destroy_fsync_dnodes(&tmp_inode_list, err);
@ -828,7 +854,7 @@ skip:
} else {
clear_sbi_flag(sbi, SBI_POR_DOING);
}
mutex_unlock(&sbi->cp_mutex);
f2fs_up_write(&sbi->cp_global_sem);
/* let's drop all the directory inodes for clean checkpoint */
destroy_fsync_dnodes(&dir_list, err);
@ -844,8 +870,6 @@ skip:
}
}
kmem_cache_destroy(fsync_entry_slab);
out:
#ifdef CONFIG_QUOTA
/* Turn quotas off */
if (quota_enabled)
@ -853,5 +877,19 @@ out:
#endif
sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
return ret ? ret: err;
return ret ? ret : err;
}
int __init f2fs_create_recovery_cache(void)
{
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
if (!fsync_entry_slab)
return -ENOMEM;
return 0;
}
void f2fs_destroy_recovery_cache(void)
{
kmem_cache_destroy(fsync_entry_slab);
}

File diff suppressed because it is too large Load Diff

View File

@ -16,13 +16,20 @@
#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */
#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE)
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE)
static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
unsigned short seg_type)
{
f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG);
}
#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA)
#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA)
@ -34,7 +41,9 @@
((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno))
#define IS_CURSEC(sbi, secno) \
(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
@ -48,7 +57,11 @@
((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
(sbi)->segs_per_sec) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
(sbi)->segs_per_sec)) \
(sbi)->segs_per_sec) || \
((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \
(sbi)->segs_per_sec) || \
((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \
(sbi)->segs_per_sec))
#define MAIN_BLKADDR(sbi) \
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
@ -129,23 +142,28 @@ enum {
};
/*
* In the victim_sel_policy->alloc_mode, there are two block allocation modes.
* In the victim_sel_policy->alloc_mode, there are three block allocation modes.
* LFS writes data sequentially with cleaning operations.
* SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
* AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into
* fragmented segment which has similar aging degree.
*/
enum {
LFS = 0,
SSR
SSR,
AT_SSR,
};
/*
* In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
* In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes.
* GC_CB is based on cost-benefit algorithm.
* GC_GREEDY is based on greedy algorithm.
* GC_AT is based on age-threshold algorithm.
*/
enum {
GC_CB = 0,
GC_GREEDY,
GC_AT,
ALLOC_NEXT,
FLUSH_DEVICE,
MAX_GC_POLICY,
@ -154,24 +172,28 @@ enum {
/*
* BG_GC means the background cleaning job.
* FG_GC means the on-demand cleaning job.
* FORCE_FG_GC means on-demand cleaning job in background.
*/
enum {
BG_GC = 0,
FG_GC,
FORCE_FG_GC,
};
/* for a function parameter to select a victim segment */
struct victim_sel_policy {
int alloc_mode; /* LFS or SSR */
int gc_mode; /* GC_CB or GC_GREEDY */
unsigned long *dirty_segmap; /* dirty segment bitmap */
unsigned int max_search; /* maximum # of segments to search */
unsigned long *dirty_bitmap; /* dirty segment/section bitmap */
unsigned int max_search; /*
* maximum # of segments/sections
* to search
*/
unsigned int offset; /* last scanned bitmap offset */
unsigned int ofs_unit; /* bitmap search unit */
unsigned int min_cost; /* minimum cost */
unsigned long long oldest_age; /* oldest age of segments having the same min cost */
unsigned int min_segno; /* segment # having min. cost */
unsigned long long age; /* mtime of GCed section*/
unsigned long long age_threshold;/* age threshold */
};
struct seg_entry {
@ -184,7 +206,7 @@ struct seg_entry {
unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */
#endif
/*
* # of valid blocks and the validity bitmap stored in the the last
* # of valid blocks and the validity bitmap stored in the last
* checkpoint pack. This information is used by the SSR mode.
*/
unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */
@ -237,6 +259,8 @@ struct sit_info {
unsigned long long mounted_time; /* mount time */
unsigned long long min_mtime; /* min. modification time */
unsigned long long max_mtime; /* max. modification time */
unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */
unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */
unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
};
@ -266,6 +290,7 @@ enum dirty_type {
struct dirty_seglist_info {
const struct victim_selection *v_ops; /* victim selction operation */
unsigned long *dirty_segmap[NR_DIRTY_TYPE];
unsigned long *dirty_secmap;
struct mutex seglist_lock; /* lock for segment bitmaps */
int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
unsigned long *victim_secmap; /* background GC victims */
@ -274,7 +299,7 @@ struct dirty_seglist_info {
/* victim selection function for cleaning and SSR */
struct victim_selection {
int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
int, int, char);
int, int, char, unsigned long long);
};
/* for active log information */
@ -284,10 +309,13 @@ struct curseg_info {
struct rw_semaphore journal_rwsem; /* protect journal area */
struct f2fs_journal *journal; /* cached journal info */
unsigned char alloc_type; /* current allocation type */
unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */
unsigned int segno; /* current segment number */
unsigned short next_blkoff; /* next block offset to write */
unsigned int zone; /* current zone number */
unsigned int next_segno; /* preallocated segment */
int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */
bool inited; /* indicate inmem log is inited */
};
struct sit_entry_set {
@ -301,8 +329,6 @@ struct sit_entry_set {
*/
static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
{
if (type == CURSEG_COLD_DATA_PINNED)
type = CURSEG_COLD_DATA;
return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
}
@ -334,8 +360,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
}
static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
unsigned int segno)
unsigned int segno, bool use_section)
{
if (use_section && __is_large_section(sbi)) {
unsigned int start_segno = START_SEGNO(segno);
unsigned int blocks = 0;
int i;
for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) {
struct seg_entry *se = get_seg_entry(sbi, start_segno);
blocks += se->ckpt_valid_blocks;
}
return blocks;
}
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
}
@ -407,6 +445,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
spin_lock(&free_i->segmap_lock);
clear_bit(segno, free_i->free_segmap);
@ -414,7 +453,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
next = find_next_bit(free_i->free_segmap,
start_segno + sbi->segs_per_sec, start_segno);
if (next >= start_segno + sbi->segs_per_sec) {
if (next >= start_segno + usable_segs) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
}
@ -434,22 +473,23 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi,
}
static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int segno)
unsigned int segno, bool inmem)
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
spin_lock(&free_i->segmap_lock);
if (test_and_clear_bit(segno, free_i->free_segmap)) {
free_i->free_segments++;
if (IS_CURSEC(sbi, secno))
if (!inmem && IS_CURSEC(sbi, secno))
goto skip_free;
next = find_next_bit(free_i->free_segmap,
start_segno + sbi->segs_per_sec, start_segno);
if (next >= start_segno + sbi->segs_per_sec) {
if (next >= start_segno + usable_segs) {
if (test_and_clear_bit(secno, free_i->free_secmap))
free_i->free_sections++;
}
@ -496,9 +536,10 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
return FREE_I(sbi)->free_segments;
}
static inline int reserved_segments(struct f2fs_sb_info *sbi)
static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi)
{
return SM_I(sbi)->reserved_segments;
return SM_I(sbi)->reserved_segments +
SM_I(sbi)->additional_reserved_segments;
}
static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
@ -528,7 +569,7 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
static inline int reserved_sections(struct f2fs_sb_info *sbi)
{
return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
}
static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
@ -542,8 +583,8 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
/* check current node segment */
for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
segno = CURSEG_I(sbi, i)->segno;
left_blocks = sbi->blocks_per_seg -
get_seg_entry(sbi, segno)->ckpt_valid_blocks;
left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
get_seg_entry(sbi, segno)->ckpt_valid_blocks;
if (node_blocks > left_blocks)
return false;
@ -551,7 +592,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
/* check current data segment */
segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
left_blocks = sbi->blocks_per_seg -
left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
get_seg_entry(sbi, segno)->ckpt_valid_blocks;
if (dent_blocks > left_blocks)
return false;
@ -610,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* pages over min_fsync_blocks. (=default option)
* F2FS_IPU_ASYNC - do IPU given by asynchronous write requests.
* F2FS_IPU_NOCACHE - disable IPU bio cache.
* F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode)
* F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has
* FI_OPU_WRITE flag.
* F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 20
@ -626,6 +669,7 @@ enum {
F2FS_IPU_FSYNC,
F2FS_IPU_ASYNC,
F2FS_IPU_NOCACHE,
F2FS_IPU_HONOR_OPU_WRITE,
};
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
@ -673,21 +717,22 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
int valid_blocks = 0;
int cur_pos = 0, next_pos;
unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno);
/* check bitmap with valid block count */
do {
if (is_valid) {
next_pos = find_next_zero_bit_le(&raw_sit->valid_map,
sbi->blocks_per_seg,
usable_blks_per_seg,
cur_pos);
valid_blocks += next_pos - cur_pos;
} else
next_pos = find_next_bit_le(&raw_sit->valid_map,
sbi->blocks_per_seg,
usable_blks_per_seg,
cur_pos);
cur_pos = next_pos;
is_valid = !is_valid;
} while (cur_pos < sbi->blocks_per_seg);
} while (cur_pos < usable_blks_per_seg);
if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) {
f2fs_err(sbi, "Mismatch valid blocks %d vs. %d",
@ -696,8 +741,13 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
if (usable_blks_per_seg < sbi->blocks_per_seg)
f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map,
sbi->blocks_per_seg,
usable_blks_per_seg) != sbi->blocks_per_seg);
/* check segment usage, and check boundary of a given segment number */
if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg
|| segno > TOTAL_SEGS(sbi) - 1)) {
f2fs_err(sbi, "Wrong valid blocks %d or segno %u",
GET_SIT_VBLOCKS(raw_sit), segno);

View File

@ -18,9 +18,7 @@ static unsigned int shrinker_run_no;
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
{
long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
return count > 0 ? count : 0;
return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT];
}
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)

File diff suppressed because it is too large Load Diff

View File

@ -11,10 +11,13 @@
#include <linux/f2fs_fs.h>
#include <linux/seq_file.h>
#include <linux/unicode.h>
#include <linux/ioprio.h>
#include <linux/sysfs.h>
#include "f2fs.h"
#include "segment.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
static struct proc_dir_entry *f2fs_proc_root;
@ -27,13 +30,25 @@ enum {
NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
#ifdef CONFIG_F2FS_STAT_FS
STAT_INFO, /* struct f2fs_stat_info */
STAT_INFO, /* struct f2fs_stat_info */
#endif
#ifdef CONFIG_F2FS_FAULT_INJECTION
FAULT_INFO_RATE, /* struct f2fs_fault_info */
FAULT_INFO_TYPE, /* struct f2fs_fault_info */
#endif
RESERVED_BLOCKS, /* struct f2fs_sb_info */
CPRC_INFO, /* struct ckpt_req_control */
ATGC_INFO, /* struct atgc_management */
};
static const char *gc_mode_names[MAX_GC_MODE] = {
"GC_NORMAL",
"GC_IDLE_CB",
"GC_IDLE_GREEDY",
"GC_IDLE_AT",
"GC_URGENT_HIGH",
"GC_URGENT_LOW",
"GC_URGENT_MID"
};
struct f2fs_attr {
@ -70,6 +85,10 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
else if (struct_type == STAT_INFO)
return (unsigned char *)F2FS_STAT(sbi);
#endif
else if (struct_type == CPRC_INFO)
return (unsigned char *)&sbi->cprc_info;
else if (struct_type == ATGC_INFO)
return (unsigned char *)&sbi->am;
return NULL;
}
@ -87,28 +106,42 @@ static ssize_t free_segments_show(struct f2fs_attr *a,
(unsigned long long)(free_segments(sbi)));
}
static ssize_t ovp_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sprintf(buf, "%llu\n",
(unsigned long long)(overprovision_segments(sbi)));
}
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->sb;
if (!sb->s_bdev->bd_part)
return sprintf(buf, "0\n");
return sprintf(buf, "%llu\n",
(unsigned long long)(sbi->kbytes_written +
BD_PART_WRITTEN(sbi)));
((f2fs_get_sectors_written(sbi) -
sbi->sectors_written_start) >> 1)));
}
static ssize_t sb_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sprintf(buf, "%lx\n", sbi->s_flag);
}
static ssize_t pending_discard_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
if (!SM_I(sbi)->dcc_info)
return -EINVAL;
return sprintf(buf, "%llu\n", (unsigned long long)atomic_read(
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
}
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->sb;
int len = 0;
if (!sb->s_bdev->bd_part)
return sprintf(buf, "0\n");
if (f2fs_sb_has_encrypt(sbi))
len += scnprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
@ -145,6 +178,9 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_casefold(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
if (f2fs_sb_has_readonly(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "readonly");
if (f2fs_sb_has_compression(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "compression");
@ -225,6 +261,13 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
}
#endif
static ssize_t main_blkaddr_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)MAIN_BLKADDR(sbi));
}
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@ -256,6 +299,50 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return len;
}
if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
struct ckpt_req_control *cprc = &sbi->cprc_info;
int len = 0;
int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio);
int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio);
if (class == IOPRIO_CLASS_RT)
len += scnprintf(buf + len, PAGE_SIZE - len, "rt,");
else if (class == IOPRIO_CLASS_BE)
len += scnprintf(buf + len, PAGE_SIZE - len, "be,");
else
return -EINVAL;
len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data);
return len;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!strcmp(a->attr.name, "compr_written_block"))
return snprintf(buf, PAGE_SIZE, "%llu\n",
sbi->compr_written_block);
if (!strcmp(a->attr.name, "compr_saved_block"))
return snprintf(buf, PAGE_SIZE, "%llu\n",
sbi->compr_saved_block);
if (!strcmp(a->attr.name, "compr_new_inode"))
return snprintf(buf, PAGE_SIZE, "%u\n",
sbi->compr_new_inode);
#endif
if (!strcmp(a->attr.name, "gc_urgent"))
return snprintf(buf, PAGE_SIZE, "%s\n",
gc_mode_names[sbi->gc_mode]);
if (!strcmp(a->attr.name, "gc_segment_mode"))
return snprintf(buf, PAGE_SIZE, "%s\n",
gc_mode_names[sbi->gc_segment_mode]);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return snprintf(buf, PAGE_SIZE, "%u\n",
sbi->gc_reclaimed_segs[sbi->gc_segment_mode]);
}
ui = (unsigned int *)(ptr + a->offset);
return sprintf(buf, "%u\n", *ui);
@ -292,10 +379,10 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
set = false;
}
if (strlen(name) >= F2FS_EXTENSION_LEN)
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
down_write(&sbi->sb_lock);
f2fs_down_write(&sbi->sb_lock);
ret = f2fs_update_extension_list(sbi, name, hot, set);
if (ret)
@ -305,10 +392,42 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (ret)
f2fs_update_extension_list(sbi, name, hot, !set);
out:
up_write(&sbi->sb_lock);
f2fs_up_write(&sbi->sb_lock);
return ret ? ret : count;
}
if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
const char *name = strim((char *)buf);
struct ckpt_req_control *cprc = &sbi->cprc_info;
int class;
long data;
int ret;
if (!strncmp(name, "rt,", 3))
class = IOPRIO_CLASS_RT;
else if (!strncmp(name, "be,", 3))
class = IOPRIO_CLASS_BE;
else
return -EINVAL;
name += 3;
ret = kstrtol(name, 10, &data);
if (ret)
return ret;
if (data >= IOPRIO_BE_NR || data < 0)
return -EINVAL;
cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data);
if (test_opt(sbi, MERGE_CHECKPOINT)) {
ret = set_task_ioprio(cprc->f2fs_issue_ckpt,
cprc->ckpt_thread_ioprio);
if (ret)
return ret;
}
return count;
}
ui = (unsigned int *)(ptr + a->offset);
ret = kstrtoul(skip_spaces(buf), 0, &t);
@ -323,7 +442,9 @@ out:
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
F2FS_OPTION(sbi).root_reserved_blocks)) {
F2FS_OPTION(sbi).root_reserved_blocks -
sbi->blocks_per_seg *
SM_I(sbi)->additional_reserved_segments)) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
@ -337,6 +458,8 @@ out:
if (!strcmp(a->attr.name, "discard_granularity")) {
if (t == 0 || t > MAX_PLIST_NUM)
return -EINVAL;
if (!f2fs_block_unit_discard(sbi))
return -EINVAL;
if (t == *ui)
return count;
*ui = t;
@ -352,29 +475,55 @@ out:
return -EINVAL;
if (!strcmp(a->attr.name, "gc_urgent")) {
if (t >= 1) {
sbi->gc_mode = GC_URGENT;
if (t == 0) {
sbi->gc_mode = GC_NORMAL;
} else if (t == 1) {
sbi->gc_mode = GC_URGENT_HIGH;
if (sbi->gc_thread) {
sbi->gc_thread->gc_wake = 1;
wake_up_interruptible_all(
&sbi->gc_thread->gc_wait_queue_head);
wake_up_discard_thread(sbi, true);
}
} else if (t == 2) {
sbi->gc_mode = GC_URGENT_LOW;
} else if (t == 3) {
sbi->gc_mode = GC_URGENT_MID;
if (sbi->gc_thread) {
sbi->gc_thread->gc_wake = 1;
wake_up_interruptible_all(
&sbi->gc_thread->gc_wait_queue_head);
}
} else {
return -EINVAL;
}
return count;
}
if (!strcmp(a->attr.name, "gc_idle")) {
if (t == GC_IDLE_CB) {
sbi->gc_mode = GC_IDLE_CB;
} else if (t == GC_IDLE_GREEDY) {
sbi->gc_mode = GC_IDLE_GREEDY;
} else if (t == GC_IDLE_AT) {
if (!sbi->am.atgc_enabled)
return -EINVAL;
sbi->gc_mode = GC_IDLE_AT;
} else {
sbi->gc_mode = GC_NORMAL;
}
return count;
}
if (!strcmp(a->attr.name, "gc_idle")) {
if (t == GC_IDLE_CB)
sbi->gc_mode = GC_IDLE_CB;
else if (t == GC_IDLE_GREEDY)
sbi->gc_mode = GC_IDLE_GREEDY;
else
sbi->gc_mode = GC_NORMAL;
if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) {
spin_lock(&sbi->gc_urgent_high_lock);
sbi->gc_urgent_high_limited = t != 0;
sbi->gc_urgent_high_remaining = t;
spin_unlock(&sbi->gc_urgent_high_lock);
return count;
}
#ifdef CONFIG_F2FS_IOSTAT
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
if (!sbi->iostat_enable)
@ -390,6 +539,70 @@ out:
spin_unlock(&sbi->iostat_lock);
return count;
}
#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!strcmp(a->attr.name, "compr_written_block") ||
!strcmp(a->attr.name, "compr_saved_block")) {
if (t != 0)
return -EINVAL;
sbi->compr_written_block = 0;
sbi->compr_saved_block = 0;
return count;
}
if (!strcmp(a->attr.name, "compr_new_inode")) {
if (t != 0)
return -EINVAL;
sbi->compr_new_inode = 0;
return count;
}
#endif
if (!strcmp(a->attr.name, "atgc_candidate_ratio")) {
if (t > 100)
return -EINVAL;
sbi->am.candidate_ratio = t;
return count;
}
if (!strcmp(a->attr.name, "atgc_age_weight")) {
if (t > 100)
return -EINVAL;
sbi->am.age_weight = t;
return count;
}
if (!strcmp(a->attr.name, "gc_segment_mode")) {
if (t < MAX_GC_MODE)
sbi->gc_segment_mode = t;
else
return -EINVAL;
return count;
}
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
if (t != 0)
return -EINVAL;
sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0;
return count;
}
if (!strcmp(a->attr.name, "max_fragment_chunk")) {
if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
sbi->max_fragment_chunk = t;
else
return -EINVAL;
return count;
}
if (!strcmp(a->attr.name, "max_fragment_hole")) {
if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
sbi->max_fragment_hole = t;
else
return -EINVAL;
return count;
}
*ui = (unsigned int)t;
@ -442,46 +655,49 @@ static void f2fs_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
enum feat_id {
FEAT_CRYPTO = 0,
FEAT_BLKZONED,
FEAT_ATOMIC_WRITE,
FEAT_EXTRA_ATTR,
FEAT_PROJECT_QUOTA,
FEAT_INODE_CHECKSUM,
FEAT_FLEXIBLE_INLINE_XATTR,
FEAT_QUOTA_INO,
FEAT_INODE_CRTIME,
FEAT_LOST_FOUND,
FEAT_VERITY,
FEAT_SB_CHECKSUM,
FEAT_CASEFOLD,
FEAT_COMPRESSION,
FEAT_TEST_DUMMY_ENCRYPTION_V2,
};
/*
* Note that there are three feature list entries:
* 1) /sys/fs/f2fs/features
* : shows runtime features supported by in-kernel f2fs along with Kconfig.
* - ref. F2FS_FEATURE_RO_ATTR()
*
* 2) /sys/fs/f2fs/$s_id/features <deprecated>
* : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This
* won't add new feature anymore, and thus, users should check entries in 3)
* instead of this 2).
*
* 3) /sys/fs/f2fs/$s_id/feature_list
* : shows on-disk features enabled by mkfs.f2fs per instance, which follows
* sysfs entry rule where each entry should expose single value.
* This list covers old feature list provided by 2) and beyond. Therefore,
* please add new on-disk feature in this list only.
* - ref. F2FS_SB_FEATURE_RO_ATTR()
*/
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
switch (a->id) {
case FEAT_CRYPTO:
case FEAT_BLKZONED:
case FEAT_ATOMIC_WRITE:
case FEAT_EXTRA_ATTR:
case FEAT_PROJECT_QUOTA:
case FEAT_INODE_CHECKSUM:
case FEAT_FLEXIBLE_INLINE_XATTR:
case FEAT_QUOTA_INO:
case FEAT_INODE_CRTIME:
case FEAT_LOST_FOUND:
case FEAT_VERITY:
case FEAT_SB_CHECKSUM:
case FEAT_CASEFOLD:
case FEAT_COMPRESSION:
case FEAT_TEST_DUMMY_ENCRYPTION_V2:
return sprintf(buf, "supported\n");
}
#define F2FS_FEATURE_RO_ATTR(_name) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
.show = f2fs_feature_show, \
}
static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
if (F2FS_HAS_FEATURE(sbi, a->id))
return sprintf(buf, "supported\n");
}
return 0;
return sprintf(buf, "unsupported\n");
}
#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \
static struct f2fs_attr f2fs_attr_sb_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
.show = f2fs_sb_feature_show, \
.id = F2FS_FEATURE_##_feat, \
}
#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
@ -501,13 +717,6 @@ static struct f2fs_attr f2fs_attr_##_name = { \
#define F2FS_GENERAL_RO_ATTR(name) \
static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
#define F2FS_FEATURE_RO_ATTR(_name, _id) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
.show = f2fs_feature_show, \
.id = _id, \
}
#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
@ -524,8 +733,11 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
@ -538,6 +750,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@ -548,9 +761,12 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
#ifdef CONFIG_F2FS_FAULT_INJECTION
@ -559,14 +775,19 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining);
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
F2FS_GENERAL_RO_ATTR(ovp_segments);
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@ -578,28 +799,49 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks);
#endif
#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2);
F2FS_FEATURE_RO_ATTR(encryption);
F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2);
#ifdef CONFIG_UNICODE
F2FS_FEATURE_RO_ATTR(encrypted_casefold);
#endif
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED);
F2FS_FEATURE_RO_ATTR(block_zoned);
#endif
F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE);
F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR);
F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA);
F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
F2FS_FEATURE_RO_ATTR(atomic_write);
F2FS_FEATURE_RO_ATTR(extra_attr);
F2FS_FEATURE_RO_ATTR(project_quota);
F2FS_FEATURE_RO_ATTR(inode_checksum);
F2FS_FEATURE_RO_ATTR(flexible_inline_xattr);
F2FS_FEATURE_RO_ATTR(quota_ino);
F2FS_FEATURE_RO_ATTR(inode_crtime);
F2FS_FEATURE_RO_ATTR(lost_found);
#ifdef CONFIG_FS_VERITY
F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
F2FS_FEATURE_RO_ATTR(verity);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
F2FS_FEATURE_RO_ATTR(sb_checksum);
#ifdef CONFIG_UNICODE
F2FS_FEATURE_RO_ATTR(casefold);
#endif
F2FS_FEATURE_RO_ATTR(readonly);
#ifdef CONFIG_F2FS_FS_COMPRESSION
F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
F2FS_FEATURE_RO_ATTR(compression);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode);
#endif
F2FS_FEATURE_RO_ATTR(pin_file);
/* For ATGC */
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio);
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count);
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight);
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@ -612,7 +854,12 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reclaim_segments),
ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
ATTR_LIST(max_discard_request),
ATTR_LIST(min_discard_issue_time),
ATTR_LIST(mid_discard_issue_time),
ATTR_LIST(max_discard_issue_time),
ATTR_LIST(discard_granularity),
ATTR_LIST(pending_discard),
ATTR_LIST(batched_trim_sections),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
@ -626,14 +873,18 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
ATTR_LIST(dirty_nats_ratio),
ATTR_LIST(max_roll_forward_node_blocks),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
ATTR_LIST(umount_discard_timeout),
#ifdef CONFIG_F2FS_IOSTAT
ATTR_LIST(iostat_enable),
ATTR_LIST(iostat_period_ms),
#endif
ATTR_LIST(readdir_ra),
ATTR_LIST(max_io_bytes),
ATTR_LIST(gc_pin_file_thresh),
ATTR_LIST(extension_list),
#ifdef CONFIG_F2FS_FAULT_INJECTION
@ -642,8 +893,11 @@ static struct attribute *f2fs_attrs[] = {
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
ATTR_LIST(gc_urgent_high_remaining),
ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
ATTR_LIST(ovp_segments),
ATTR_LIST(unusable),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(features),
@ -660,6 +914,20 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(moved_blocks_background),
ATTR_LIST(avg_vblocks),
#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compr_written_block),
ATTR_LIST(compr_saved_block),
ATTR_LIST(compr_new_inode),
#endif
/* For ATGC */
ATTR_LIST(atgc_candidate_ratio),
ATTR_LIST(atgc_candidate_count),
ATTR_LIST(atgc_age_weight),
ATTR_LIST(atgc_age_threshold),
ATTR_LIST(gc_segment_mode),
ATTR_LIST(gc_reclaimed_segments),
ATTR_LIST(max_fragment_chunk),
ATTR_LIST(max_fragment_hole),
NULL,
};
@ -667,7 +935,10 @@ static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
ATTR_LIST(test_dummy_encryption_v2),
#ifdef CONFIG_UNICODE
ATTR_LIST(encrypted_casefold),
#endif
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
ATTR_LIST(block_zoned),
#endif
@ -683,10 +954,53 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTR_LIST(verity),
#endif
ATTR_LIST(sb_checksum),
#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
#endif
ATTR_LIST(readonly),
#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compression),
#endif
ATTR_LIST(pin_file),
NULL,
};
F2FS_GENERAL_RO_ATTR(sb_status);
static struct attribute *f2fs_stat_attrs[] = {
ATTR_LIST(sb_status),
NULL,
};
F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT);
F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED);
F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR);
F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA);
F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM);
F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO);
F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME);
F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND);
F2FS_SB_FEATURE_RO_ATTR(verity, VERITY);
F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM);
F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
static struct attribute *f2fs_sb_feat_attrs[] = {
ATTR_LIST(sb_encryption),
ATTR_LIST(sb_block_zoned),
ATTR_LIST(sb_extra_attr),
ATTR_LIST(sb_project_quota),
ATTR_LIST(sb_inode_checksum),
ATTR_LIST(sb_flexible_inline_xattr),
ATTR_LIST(sb_quota_ino),
ATTR_LIST(sb_inode_crtime),
ATTR_LIST(sb_lost_found),
ATTR_LIST(sb_verity),
ATTR_LIST(sb_sb_checksum),
ATTR_LIST(sb_casefold),
ATTR_LIST(sb_compression),
ATTR_LIST(sb_readonly),
NULL,
};
@ -706,7 +1020,7 @@ static struct kobj_type f2fs_ktype = {
};
static struct kset f2fs_kset = {
.kobj = {.ktype = &f2fs_ktype},
.kobj = {.ktype = &f2fs_ktype},
};
static struct kobj_type f2fs_feat_ktype = {
@ -718,6 +1032,71 @@ static struct kobject f2fs_feat = {
.kset = &f2fs_kset,
};
static ssize_t f2fs_stat_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_stat_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->show ? a->show(a, sbi, buf) : 0;
}
static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t len)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_stat_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->store ? a->store(a, sbi, buf, len) : 0;
}
static void f2fs_stat_kobj_release(struct kobject *kobj)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_stat_kobj);
complete(&sbi->s_stat_kobj_unregister);
}
static const struct sysfs_ops f2fs_stat_attr_ops = {
.show = f2fs_stat_attr_show,
.store = f2fs_stat_attr_store,
};
static struct kobj_type f2fs_stat_ktype = {
.default_attrs = f2fs_stat_attrs,
.sysfs_ops = &f2fs_stat_attr_ops,
.release = f2fs_stat_kobj_release,
};
static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_feature_list_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->show ? a->show(a, sbi, buf) : 0;
}
static void f2fs_feature_list_kobj_release(struct kobject *kobj)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_feature_list_kobj);
complete(&sbi->s_feature_list_kobj_unregister);
}
static const struct sysfs_ops f2fs_feature_list_attr_ops = {
.show = f2fs_sb_feat_attr_show,
};
static struct kobj_type f2fs_feature_list_ktype = {
.default_attrs = f2fs_sb_feat_attrs,
.sysfs_ops = &f2fs_feature_list_attr_ops,
.release = f2fs_feature_list_kobj_release,
};
static int __maybe_unused segment_info_seq_show(struct seq_file *seq,
void *offset)
{
@ -769,101 +1148,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq,
return 0;
}
void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
spin_lock(&sbi->iostat_lock);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
spin_unlock(&sbi->iostat_lock);
return;
}
sbi->iostat_next_period = jiffies +
msecs_to_jiffies(sbi->iostat_period_ms);
for (i = 0; i < NR_IO_TYPE; i++) {
iostat_diff[i] = sbi->rw_iostat[i] -
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
spin_unlock(&sbi->iostat_lock);
trace_f2fs_iostat(sbi, iostat_diff);
}
static int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
time64_t now = ktime_get_real_seconds();
if (!sbi->iostat_enable)
return 0;
seq_printf(seq, "time: %-16llu\n", now);
/* print app write IOs */
seq_puts(seq, "[WRITE]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_IO]);
/* print fs write IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GC_DATA_IO]);
seq_printf(seq, "fs gc node: %-16llu\n",
sbi->rw_iostat[FS_GC_NODE_IO]);
seq_printf(seq, "fs cp data: %-16llu\n",
sbi->rw_iostat[FS_CP_DATA_IO]);
seq_printf(seq, "fs cp node: %-16llu\n",
sbi->rw_iostat[FS_CP_NODE_IO]);
seq_printf(seq, "fs cp meta: %-16llu\n",
sbi->rw_iostat[FS_CP_META_IO]);
/* print app read IOs */
seq_puts(seq, "[READ]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_READ_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_READ_IO]);
/* print fs read IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_READ_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GDATA_READ_IO]);
seq_printf(seq, "fs compr_data: %-16llu\n",
sbi->rw_iostat[FS_CDATA_READ_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_READ_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_READ_IO]);
/* print other IOs */
seq_puts(seq, "[OTHER]\n");
seq_printf(seq, "fs discard: %-16llu\n",
sbi->rw_iostat[FS_DISCARD]);
return 0;
}
static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
void *offset)
{
@ -901,7 +1185,9 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \
F2FS_PROC_FILE_DEF(segment_info);
F2FS_PROC_FILE_DEF(segment_bits);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_PROC_FILE_DEF(iostat_info);
#endif
F2FS_PROC_FILE_DEF(victim_bits);
int __init f2fs_init_sysfs(void)
@ -942,37 +1228,71 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
"%s", sb->s_id);
if (err) {
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
return err;
}
if (err)
goto put_sb_kobj;
sbi->s_stat_kobj.kset = &f2fs_kset;
init_completion(&sbi->s_stat_kobj_unregister);
err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype,
&sbi->s_kobj, "stat");
if (err)
goto put_stat_kobj;
sbi->s_feature_list_kobj.kset = &f2fs_kset;
init_completion(&sbi->s_feature_list_kobj_unregister);
err = kobject_init_and_add(&sbi->s_feature_list_kobj,
&f2fs_feature_list_ktype,
&sbi->s_kobj, "feature_list");
if (err)
goto put_feature_list_kobj;
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
if (sbi->s_proc) {
proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
&f2fs_seq_segment_info_fops, sb);
proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
&f2fs_seq_segment_bits_fops, sb);
proc_create_data("iostat_info", S_IRUGO, sbi->s_proc,
proc_create_data("segment_info", 0444, sbi->s_proc,
&f2fs_seq_segment_info_fops, sb);
proc_create_data("segment_bits", 0444, sbi->s_proc,
&f2fs_seq_segment_bits_fops, sb);
#ifdef CONFIG_F2FS_IOSTAT
proc_create_data("iostat_info", 0444, sbi->s_proc,
&f2fs_seq_iostat_info_fops, sb);
proc_create_data("victim_bits", S_IRUGO, sbi->s_proc,
#endif
proc_create_data("victim_bits", 0444, sbi->s_proc,
&f2fs_seq_victim_bits_fops, sb);
}
return 0;
put_feature_list_kobj:
kobject_put(&sbi->s_feature_list_kobj);
wait_for_completion(&sbi->s_feature_list_kobj_unregister);
put_stat_kobj:
kobject_put(&sbi->s_stat_kobj);
wait_for_completion(&sbi->s_stat_kobj_unregister);
put_sb_kobj:
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
return err;
}
void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
{
if (sbi->s_proc) {
#ifdef CONFIG_F2FS_IOSTAT
remove_proc_entry("iostat_info", sbi->s_proc);
#endif
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry("victim_bits", sbi->s_proc);
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
kobject_del(&sbi->s_stat_kobj);
kobject_put(&sbi->s_stat_kobj);
wait_for_completion(&sbi->s_stat_kobj_unregister);
kobject_del(&sbi->s_feature_list_kobj);
kobject_put(&sbi->s_feature_list_kobj);
wait_for_completion(&sbi->s_feature_list_kobj_unregister);
kobject_del(&sbi->s_kobj);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);

View File

@ -1,165 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* f2fs IO tracer
*
* Copyright (c) 2014 Motorola Mobility
* Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/sched.h>
#include <linux/radix-tree.h>
#include "f2fs.h"
#include "trace.h"
static RADIX_TREE(pids, GFP_ATOMIC);
static spinlock_t pids_lock;
static struct last_io_info last_io;
static inline void __print_last_io(void)
{
if (!last_io.len)
return;
trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n",
last_io.major, last_io.minor,
last_io.pid, "----------------",
last_io.type,
last_io.fio.op, last_io.fio.op_flags,
last_io.fio.new_blkaddr,
last_io.len);
memset(&last_io, 0, sizeof(last_io));
}
static int __file_type(struct inode *inode, pid_t pid)
{
if (f2fs_is_atomic_file(inode))
return __ATOMIC_FILE;
else if (f2fs_is_volatile_file(inode))
return __VOLATILE_FILE;
else if (S_ISDIR(inode->i_mode))
return __DIR_FILE;
else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
return __NODE_FILE;
else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
return __META_FILE;
else if (pid)
return __NORMAL_FILE;
else
return __MISC_FILE;
}
void f2fs_trace_pid(struct page *page)
{
struct inode *inode = page->mapping->host;
pid_t pid = task_pid_nr(current);
void *p;
set_page_private(page, (unsigned long)pid);
retry:
if (radix_tree_preload(GFP_NOFS))
return;
spin_lock(&pids_lock);
p = radix_tree_lookup(&pids, pid);
if (p == current)
goto out;
if (p)
radix_tree_delete(&pids, pid);
if (radix_tree_insert(&pids, pid, current)) {
spin_unlock(&pids_lock);
radix_tree_preload_end();
cond_resched();
goto retry;
}
trace_printk("%3x:%3x %4x %-16s\n",
MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
pid, current->comm);
out:
spin_unlock(&pids_lock);
radix_tree_preload_end();
}
void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
{
struct inode *inode;
pid_t pid;
int major, minor;
if (flush) {
__print_last_io();
return;
}
inode = fio->page->mapping->host;
pid = page_private(fio->page);
major = MAJOR(inode->i_sb->s_dev);
minor = MINOR(inode->i_sb->s_dev);
if (last_io.major == major && last_io.minor == minor &&
last_io.pid == pid &&
last_io.type == __file_type(inode, pid) &&
last_io.fio.op == fio->op &&
last_io.fio.op_flags == fio->op_flags &&
last_io.fio.new_blkaddr + last_io.len ==
fio->new_blkaddr) {
last_io.len++;
return;
}
__print_last_io();
last_io.major = major;
last_io.minor = minor;
last_io.pid = pid;
last_io.type = __file_type(inode, pid);
last_io.fio = *fio;
last_io.len = 1;
return;
}
void f2fs_build_trace_ios(void)
{
spin_lock_init(&pids_lock);
}
#define PIDVEC_SIZE 128
static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
unsigned int max_items)
{
struct radix_tree_iter iter;
void **slot;
unsigned int ret = 0;
if (unlikely(!max_items))
return 0;
radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
results[ret] = iter.index;
if (++ret == max_items)
break;
}
return ret;
}
void f2fs_destroy_trace_ios(void)
{
pid_t pid[PIDVEC_SIZE];
pid_t next_pid = 0;
unsigned int found;
spin_lock(&pids_lock);
while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
unsigned idx;
next_pid = pid[found - 1] + 1;
for (idx = 0; idx < found; idx++)
radix_tree_delete(&pids, pid[idx]);
}
spin_unlock(&pids_lock);
}

View File

@ -1,43 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* f2fs IO tracer
*
* Copyright (c) 2014 Motorola Mobility
* Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
*/
#ifndef __F2FS_TRACE_H__
#define __F2FS_TRACE_H__
#ifdef CONFIG_F2FS_IO_TRACE
#include <trace/events/f2fs.h>
enum file_type {
__NORMAL_FILE,
__DIR_FILE,
__NODE_FILE,
__META_FILE,
__ATOMIC_FILE,
__VOLATILE_FILE,
__MISC_FILE,
};
struct last_io_info {
int major, minor;
pid_t pid;
enum file_type type;
struct f2fs_io_info fio;
block_t len;
};
extern void f2fs_trace_pid(struct page *);
extern void f2fs_trace_ios(struct f2fs_io_info *, int);
extern void f2fs_build_trace_ios(void);
extern void f2fs_destroy_trace_ios(void);
#else
#define f2fs_trace_pid(p)
#define f2fs_trace_ios(i, n)
#define f2fs_build_trace_ios()
#define f2fs_destroy_trace_ios()
#endif
#endif /* __F2FS_TRACE_H__ */

View File

@ -29,6 +29,8 @@
#include "f2fs.h"
#include "xattr.h"
#define F2FS_VERIFY_VER (1)
static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode)
{
return round_up(inode->i_size, 65536);
@ -134,7 +136,7 @@ static int f2fs_begin_enable_verity(struct file *filp)
* here and not rely on ->open() doing it. This must be done before
* evicting the inline data.
*/
err = dquot_initialize(inode);
err = f2fs_dquot_initialize(inode);
if (err)
return err;
@ -150,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc,
size_t desc_size, u64 merkle_tree_size)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
struct fsverity_descriptor_location dloc = {
.version = cpu_to_le32(1),
.version = cpu_to_le32(F2FS_VERIFY_VER),
.size = cpu_to_le32(desc_size),
.pos = cpu_to_le64(desc_pos),
};
int err = 0;
int err = 0, err2 = 0;
if (desc != NULL) {
/* Succeeded; write the verity descriptor. */
err = pagecache_write(inode, desc, desc_size, desc_pos);
/*
* If an error already occurred (which fs/verity/ signals by passing
* desc == NULL), then only clean-up is needed.
*/
if (desc == NULL)
goto cleanup;
/* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
if (!err)
err = filemap_write_and_wait(inode->i_mapping);
}
/* Append the verity descriptor. */
err = pagecache_write(inode, desc, desc_size, desc_pos);
if (err)
goto cleanup;
/* If we failed, truncate anything we wrote past i_size. */
if (desc == NULL || err)
f2fs_truncate(inode);
/*
* Write all pages (both data and verity metadata). Note that this must
* happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond
* i_size won't be written properly. For crash consistency, this also
* must happen before the verity inode flag gets persisted.
*/
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto cleanup;
/* Set the verity xattr. */
err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
NULL, XATTR_CREATE);
if (err)
goto cleanup;
/* Finally, set the verity inode flag. */
file_set_verity(inode);
f2fs_set_inode_flags(inode);
f2fs_mark_inode_dirty_sync(inode, true);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
return 0;
if (desc != NULL && !err) {
err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
NULL, XATTR_CREATE);
if (!err) {
file_set_verity(inode);
f2fs_set_inode_flags(inode);
f2fs_mark_inode_dirty_sync(inode, true);
}
cleanup:
/*
* Verity failed to be enabled, so clean up by truncating any verity
* metadata that was written beyond i_size (both from cache and from
* disk) and clearing FI_VERITY_IN_PROGRESS.
*
* Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection
* from re-instantiating cached pages we are truncating (since unlike
* normal file accesses, garbage collection isn't limited by i_size).
*/
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
truncate_inode_pages(inode->i_mapping, inode->i_size);
err2 = f2fs_truncate(inode);
if (err2) {
f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)",
err2);
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
return err;
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
return err ?: err2;
}
static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
@ -199,7 +234,7 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL);
if (res < 0 && res != -ERANGE)
return res;
if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) {
if (res != sizeof(dloc) || dloc.version != cpu_to_le32(F2FS_VERIFY_VER)) {
f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format");
return -EINVAL;
}

View File

@ -27,7 +27,8 @@ static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
{
if (likely(size == sbi->inline_xattr_slab_size)) {
*is_inline = true;
return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab,
GFP_F2FS_ZERO, false, sbi);
}
*is_inline = false;
return f2fs_kzalloc(sbi, size, GFP_NOFS);
@ -39,7 +40,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
if (is_inline)
kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
else
kvfree(xattr_addr);
kfree(xattr_addr);
}
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
@ -175,8 +176,8 @@ const struct xattr_handler f2fs_xattr_trusted_handler = {
const struct xattr_handler f2fs_xattr_advise_handler = {
.name = F2FS_SYSTEM_ADVISE_NAME,
.flags = F2FS_XATTR_INDEX_ADVISE,
.get = f2fs_xattr_advise_get,
.set = f2fs_xattr_advise_set,
.get = f2fs_xattr_advise_get,
.set = f2fs_xattr_advise_set,
};
const struct xattr_handler f2fs_xattr_security_handler = {
@ -223,15 +224,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index)
}
static struct f2fs_xattr_entry *__find_xattr(void *base_addr,
void *last_base_addr, int index,
size_t len, const char *name)
void *last_base_addr, void **last_addr,
int index, size_t len, const char *name)
{
struct f2fs_xattr_entry *entry;
list_for_each_xattr(entry, base_addr) {
if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr)
(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
if (last_addr)
*last_addr = entry;
return NULL;
}
if (entry->e_name_index != index)
continue;
@ -251,19 +255,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
unsigned int inline_size = inline_xattr_size(inode);
void *max_addr = base_addr + inline_size;
list_for_each_xattr(entry, base_addr) {
if ((void *)entry + sizeof(__u32) > max_addr ||
(void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
*last_addr = entry;
return NULL;
}
if (entry->e_name_index != index)
continue;
if (entry->e_name_len != len)
continue;
if (!memcmp(entry->e_name, name, len))
break;
}
entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name);
if (!entry)
return NULL;
/* inline xattr header or entry across max inline xattr size */
if (IS_XATTR_LAST_ENTRY(entry) &&
@ -327,7 +321,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
void *last_addr = NULL;
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
unsigned int inline_size = inline_xattr_size(inode);
int err = 0;
int err;
if (!xnid && !inline_size)
return -ENODATA;
@ -365,7 +359,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
else
cur_addr = txattr_addr;
*xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name);
*xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name);
if (!*xe) {
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
inode->i_ino);
@ -425,7 +419,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
*base_addr = txattr_addr;
return 0;
fail:
kvfree(txattr_addr);
kfree(txattr_addr);
return err;
}
@ -486,6 +480,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
f2fs_wait_on_page_writeback(xpage, NODE, true, true);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
if (IS_ERR(xpage)) {
@ -515,7 +510,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
void *buffer, size_t buffer_size, struct page *ipage)
{
struct f2fs_xattr_entry *entry = NULL;
int error = 0;
int error;
unsigned int size, len;
void *base_addr = NULL;
int base_size;
@ -528,10 +523,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
down_read(&F2FS_I(inode)->i_xattr_sem);
f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
&entry, &base_addr, &base_size, &is_inline);
up_read(&F2FS_I(inode)->i_xattr_sem);
f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@ -562,12 +557,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = d_inode(dentry);
struct f2fs_xattr_entry *entry;
void *base_addr, *last_base_addr;
int error = 0;
int error;
size_t rest = buffer_size;
down_read(&F2FS_I(inode)->i_xattr_sem);
f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = read_all_xattrs(inode, NULL, &base_addr);
up_read(&F2FS_I(inode)->i_xattr_sem);
f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@ -610,7 +605,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
error = buffer_size - rest;
cleanup:
kvfree(base_addr);
kfree(base_addr);
return error;
}
@ -632,7 +627,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
int found, newsize;
size_t len;
__u32 new_hsize;
int error = 0;
int error;
if (name == NULL)
return -EINVAL;
@ -655,7 +650,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
/* find entry with wanted name. */
here = __find_xattr(base_addr, last_base_addr, index, len, name);
here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
if (!here) {
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
inode->i_ino);
@ -673,7 +668,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
}
if (value && f2fs_xattr_value_same(here, value, size))
goto exit;
goto same;
} else if ((flags & XATTR_REPLACE)) {
error = -ENODATA;
goto exit;
@ -683,6 +678,8 @@ static int __f2fs_setxattr(struct inode *inode, int index,
while (!IS_XATTR_LAST_ENTRY(last)) {
if ((void *)(last) + sizeof(__u32) > last_base_addr ||
(void *)XATTR_NEXT_ENTRY(last) > last_base_addr) {
f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu",
inode->i_ino, ENTRY_SIZE(last));
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
goto exit;
@ -745,19 +742,22 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
goto exit;
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
inode->i_mode = F2FS_I(inode)->i_acl_mode;
inode->i_ctime = current_time(inode);
clear_inode_flag(inode, FI_ACL_MODE);
}
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
f2fs_mark_inode_dirty_sync(inode, true);
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
same:
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
inode->i_mode = F2FS_I(inode)->i_acl_mode;
inode->i_ctime = current_time(inode);
clear_inode_flag(inode, FI_ACL_MODE);
}
exit:
kvfree(base_addr);
kfree(base_addr);
return error;
}
@ -773,7 +773,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
err = dquot_initialize(inode);
err = f2fs_dquot_initialize(inode);
if (err)
return err;
@ -784,9 +784,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
down_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
up_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);

View File

@ -304,6 +304,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode,
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
file->f_sb_err = file_sample_sb_err(file);
if ((mode & FMODE_READ) &&
likely(fop->read || fop->read_iter))
mode |= FMODE_CAN_READ;

View File

@ -1554,25 +1554,31 @@ retry:
}
EXPORT_SYMBOL(iput);
#ifdef CONFIG_BLOCK
/**
* bmap - find a block number in a file
* @inode: inode of file
* @block: block to find
* @inode: inode owning the block number being requested
* @block: pointer containing the block to find
*
* Returns the block number on the device holding the inode that
* is the disk block number for the block of the file requested.
* That is, asked for block 4 of inode 1 the function will return the
* disk block relative to the disk start that holds that block of the
* file.
* Replaces the value in *block with the block number on the device holding
* corresponding to the requested block number in the file.
* That is, asked for block 4 of inode 1 the function will replace the
* 4 in *block, with disk block relative to the disk start that holds that
* block of the file.
*
* Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a
* hole, returns 0 and *block is also set to 0.
*/
sector_t bmap(struct inode *inode, sector_t block)
int bmap(struct inode *inode, sector_t *block)
{
sector_t res = 0;
if (inode->i_mapping->a_ops->bmap)
res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
return res;
if (!inode->i_mapping->a_ops->bmap)
return -EINVAL;
*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
return 0;
}
EXPORT_SYMBOL(bmap);
#endif
/*
* Update times in overlayed inode from underlying real inode

View File

@ -173,6 +173,34 @@ static int fiemap_check_ranges(struct super_block *sb,
return 0;
}
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 *len, u32 supported_flags)
{
u64 maxbytes = inode->i_sb->s_maxbytes;
u32 incompat_flags;
if (*len == 0)
return -EINVAL;
if (start > maxbytes)
return -EFBIG;
/*
* Shrink request scope to what the fs can actually handle.
*/
if (*len > maxbytes || (maxbytes - *len) < start)
*len = maxbytes - start;
supported_flags &= FIEMAP_FLAGS_COMPAT;
incompat_flags = fieinfo->fi_flags & ~supported_flags;
if (incompat_flags) {
fieinfo->fi_flags = incompat_flags;
return -EBADR;
}
return 0;
}
EXPORT_SYMBOL(fiemap_prep);
static int ioctl_fiemap(struct file *filp, unsigned long arg)
{
struct fiemap fiemap;

View File

@ -803,18 +803,23 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
{
int err = 0;
unsigned long long ret;
sector_t block = 0;
if (journal->j_inode) {
ret = bmap(journal->j_inode, blocknr);
if (ret)
*retp = ret;
else {
block = blocknr;
ret = bmap(journal->j_inode, &block);
if (ret || !block) {
printk(KERN_ALERT "%s: journal block not found "
"at offset %lu on %s\n",
__func__, blocknr, journal->j_devname);
err = -EIO;
__journal_abort_soft(journal, err);
} else {
*retp = block;
}
} else {
*retp = blocknr; /* +journal->j_blk_offset */
}
@ -1240,11 +1245,14 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev,
journal_t *jbd2_journal_init_inode(struct inode *inode)
{
journal_t *journal;
sector_t blocknr;
char *p;
unsigned long long blocknr;
int err = 0;
blocknr = bmap(inode, 0);
if (!blocknr) {
blocknr = 0;
err = bmap(inode, &blocknr);
if (err || !blocknr) {
pr_err("%s: Cannot locate journal superblock\n",
__func__);
return NULL;

View File

@ -1227,27 +1227,38 @@ bool is_empty_dir_inode(struct inode *inode)
}
#ifdef CONFIG_UNICODE
bool needs_casefold(const struct inode *dir)
/*
* Determine if the name of a dentry should be casefolded.
*
* Return: if names will need casefolding
*/
static bool needs_casefold(const struct inode *dir)
{
return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding &&
(!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir));
return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding;
}
EXPORT_SYMBOL(needs_casefold);
int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
/**
* generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
* @dentry: dentry whose name we are checking against
* @len: len of name of dentry
* @str: str pointer to name of dentry
* @name: Name to compare against
*
* Return: 0 if names match, 1 if mismatch, or -ERRNO
*/
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
const struct dentry *parent = READ_ONCE(dentry->d_parent);
const struct inode *inode = READ_ONCE(parent->d_inode);
const struct inode *dir = READ_ONCE(parent->d_inode);
const struct super_block *sb = dentry->d_sb;
const struct unicode_map *um = sb->s_encoding;
struct qstr entry = QSTR_INIT(str, len);
struct qstr qstr = QSTR_INIT(str, len);
char strbuf[DNAME_INLINE_LEN];
int ret;
if (!inode || !needs_casefold(inode))
if (!dir || !needs_casefold(dir))
goto fallback;
/*
* If the dentry name is stored in-line, then it may be concurrently
* modified by a rename. If this happens, the VFS will eventually retry
@ -1258,47 +1269,44 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
if (len <= DNAME_INLINE_LEN - 1) {
memcpy(strbuf, str, len);
strbuf[len] = 0;
entry.name = strbuf;
qstr.name = strbuf;
/* prevent compiler from optimizing out the temporary buffer */
barrier();
}
ret = utf8_strncasecmp(um, name, &entry);
ret = utf8_strncasecmp(um, name, &qstr);
if (ret >= 0)
return ret;
if (sb_has_enc_strict_mode(sb))
if (sb_has_strict_encoding(sb))
return -EINVAL;
fallback:
if (len != name->len)
return 1;
return !!memcmp(str, name->name, len);
}
EXPORT_SYMBOL(generic_ci_d_compare);
int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
/**
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
* @dentry: dentry of the parent directory
* @str: qstr of name whose hash we should fill in
*
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
*/
static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct inode *inode = READ_ONCE(dentry->d_inode);
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
const struct unicode_map *um = sb->s_encoding;
int ret = 0;
if (!inode || !needs_casefold(inode))
if (!dir || !needs_casefold(dir))
return 0;
ret = utf8_casefold_hash(um, dentry, str);
if (ret < 0)
goto err;
if (ret < 0 && sb_has_strict_encoding(sb))
return -EINVAL;
return 0;
err:
if (sb_has_enc_strict_mode(sb))
ret = -EINVAL;
else
ret = 0;
return ret;
}
EXPORT_SYMBOL(generic_ci_d_hash);
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
@ -1312,7 +1320,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = {
};
#endif
#if IS_ENABLED(CONFIG_UNICODE) && IS_ENABLED(CONFIG_FS_ENCRYPTION)
#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
@ -1322,28 +1330,48 @@ static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
/**
* generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
* @dir: parent of dentry whose ops to set
* @dentry: detnry to set ops on
* @dentry: dentry to set ops on
*
* This function sets the dentry ops for the given dentry to handle both
* casefolding and encryption of the dentry name.
* Casefolded directories need d_hash and d_compare set, so that the dentries
* contained in them are handled case-insensitively. Note that these operations
* are needed on the parent directory rather than on the dentries in it, and
* while the casefolding flag can be toggled on and off on an empty directory,
* dentry_operations can't be changed later. As a result, if the filesystem has
* casefolding support enabled at all, we have to give all dentries the
* casefolding operations even if their inode doesn't have the casefolding flag
* currently (and thus the casefolding ops would be no-ops for now).
*
* Encryption works differently in that the only dentry operation it needs is
* d_revalidate, which it only needs on dentries that have the no-key name flag.
* The no-key flag can't be set "later", so we don't have to worry about that.
*
* Finally, to maximize compatibility with overlayfs (which isn't compatible
* with certain dentry operations) and to avoid taking an unnecessary
* performance hit, we use custom dentry_operations for each possible
* combination rather than always installing all operations.
*/
void generic_set_encrypted_ci_d_ops(struct inode *dir, struct dentry *dentry)
void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
{
#ifdef CONFIG_FS_ENCRYPTION
if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) {
#ifdef CONFIG_UNICODE
if (dir->i_sb->s_encoding) {
d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
return;
}
bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
#endif
#ifdef CONFIG_UNICODE
bool needs_ci_ops = dentry->d_sb->s_encoding;
#endif
#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
if (needs_encrypt_ops && needs_ci_ops) {
d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
return;
}
#endif
#ifdef CONFIG_FS_ENCRYPTION
if (needs_encrypt_ops) {
d_set_d_op(dentry, &generic_encrypted_dentry_ops);
return;
}
#endif
#ifdef CONFIG_UNICODE
if (dir->i_sb->s_encoding) {
if (needs_ci_ops) {
d_set_d_op(dentry, &generic_ci_dentry_ops);
return;
}

View File

@ -739,9 +739,8 @@ static int do_dentry_open(struct file *f,
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
/* Ensure that we skip any errors that predate opening of the file */
f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH;

View File

@ -710,21 +710,18 @@ EXPORT_SYMBOL(dquot_quota_sync);
static unsigned long
dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct list_head *head;
struct dquot *dquot;
unsigned long freed = 0;
spin_lock(&dq_list_lock);
head = free_dquots.prev;
while (head != &free_dquots && sc->nr_to_scan) {
dquot = list_entry(head, struct dquot, dq_free);
while (!list_empty(&free_dquots) && sc->nr_to_scan) {
dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
remove_dquot_hash(dquot);
remove_free_dquot(dquot);
remove_inuse(dquot);
do_destroy_dquot(dquot);
sc->nr_to_scan--;
freed++;
head = free_dquots.prev;
}
spin_unlock(&dq_list_lock);
return freed;
@ -2152,7 +2149,7 @@ int dquot_file_open(struct inode *inode, struct file *file)
error = generic_file_open(inode, file);
if (!error && (file->f_mode & FMODE_WRITE))
dquot_initialize(inode);
error = dquot_initialize(inode);
return error;
}
EXPORT_SYMBOL(dquot_file_open);
@ -2971,7 +2968,7 @@ static int __init dquot_init(void)
NULL);
order = 0;
dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order);
dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
if (!dquot_hash)
panic("Cannot create dquot hash table");

View File

@ -156,7 +156,7 @@ SYSCALL_DEFINE1(syncfs, int, fd)
{
struct fd f = fdget(fd);
struct super_block *sb;
int ret;
int ret, ret2;
if (!f.file)
return -EBADF;
@ -166,8 +166,10 @@ SYSCALL_DEFINE1(syncfs, int, fd)
ret = sync_filesystem(sb);
up_read(&sb->s_umount);
ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
fdput(f);
return ret;
return ret ? ret : ret2;
}
/**

View File

@ -222,7 +222,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
err = fscrypt_prepare_lookup(dir, dentry, &nm);
ubifs_set_d_ops(dir, dentry);
generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return d_splice_alias(NULL, dentry);
if (err)

View File

@ -138,7 +138,7 @@ int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
while ((c = utf8byte(&cur))) {
if (c < 0)
return c;
return -EINVAL;
hash = partial_name_hash((unsigned char)c, hash);
}
str->hash = end_name_hash(hash);

View File

@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \
init.o \
measure.o \
open.o \
read_metadata.o \
verify.o
obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fs/verity/enable.c: ioctl to enable verity on a file
* Ioctl to enable verity on a file
*
* Copyright 2019 Google LLC
*/
@ -398,9 +398,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
* Some pages of the file may have been evicted from pagecache after
* being used in the Merkle tree construction, then read into pagecache
* again by another process reading from the file concurrently. Since
* these pages didn't undergo verification against the file measurement
* which fs-verity now claims to be enforcing, we have to wipe the
* pagecache to ensure that all future reads are verified.
* these pages didn't undergo verification against the file digest which
* fs-verity now claims to be enforcing, we have to wipe the pagecache
* to ensure that all future reads are verified.
*/
filemap_write_and_wait(inode->i_mapping);
invalidate_inode_pages2(inode->i_mapping);

View File

@ -67,52 +67,22 @@ struct merkle_tree_params {
* When a verity file is first opened, an instance of this struct is allocated
* and stored in ->i_verity_info; it remains until the inode is evicted. It
* caches information about the Merkle tree that's needed to efficiently verify
* data read from the file. It also caches the file measurement. The Merkle
* tree pages themselves are not cached here, but the filesystem may cache them.
* data read from the file. It also caches the file digest. The Merkle tree
* pages themselves are not cached here, but the filesystem may cache them.
*/
struct fsverity_info {
struct merkle_tree_params tree_params;
u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
const struct inode *inode;
};
/*
* Merkle tree properties. The file measurement is the hash of this structure
* excluding the signature and with the sig_size field set to 0.
*/
struct fsverity_descriptor {
__u8 version; /* must be 1 */
__u8 hash_algorithm; /* Merkle tree hash algorithm */
__u8 log_blocksize; /* log2 of size of data and tree blocks */
__u8 salt_size; /* size of salt in bytes; 0 if none */
__le32 sig_size; /* size of signature in bytes; 0 if none */
__le64 data_size; /* size of file the Merkle tree is built over */
__u8 root_hash[64]; /* Merkle tree root hash */
__u8 salt[32]; /* salt prepended to each hashed block */
__u8 __reserved[144]; /* must be 0's */
__u8 signature[]; /* optional PKCS#7 signature */
};
/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
sizeof(struct fsverity_descriptor))
/*
* Format in which verity file measurements are signed. This is the same as
* 'struct fsverity_digest', except here some magic bytes are prepended to
* provide some context about what is being signed in case the same key is used
* for non-fsverity purposes, and here the fields have fixed endianness.
*/
struct fsverity_signed_digest {
char magic[8]; /* must be "FSVerity" */
__le16 digest_algorithm;
__le16 digest_size;
__u8 digest[];
};
/* hash_algs.c */
extern struct fsverity_hash_alg fsverity_hash_algs[];
@ -152,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
const u8 *salt, size_t salt_size);
struct fsverity_info *fsverity_create_info(const struct inode *inode,
void *desc, size_t desc_size);
struct fsverity_descriptor *desc,
size_t desc_size);
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
void fsverity_free_info(struct fsverity_info *vi);
int fsverity_get_descriptor(struct inode *inode,
struct fsverity_descriptor **desc_ret,
size_t *desc_size_ret);
int __init fsverity_init_info_cache(void);
void __init fsverity_exit_info_cache(void);
@ -165,15 +140,13 @@ void __init fsverity_exit_info_cache(void);
#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
int fsverity_verify_signature(const struct fsverity_info *vi,
const struct fsverity_descriptor *desc,
size_t desc_size);
const u8 *signature, size_t sig_size);
int __init fsverity_init_signature(void);
#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
static inline int
fsverity_verify_signature(const struct fsverity_info *vi,
const struct fsverity_descriptor *desc,
size_t desc_size)
const u8 *signature, size_t sig_size)
{
return 0;
}

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fs/verity/hash_algs.c: fs-verity hash algorithms
* fs-verity hash algorithms
*
* Copyright 2019 Google LLC
*/

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fs/verity/init.c: fs-verity module initialization and logging
* fs-verity module initialization and logging
*
* Copyright 2019 Google LLC
*/

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fs/verity/measure.c: ioctl to get a verity file's measurement
* Ioctl to get a verity file's digest
*
* Copyright 2019 Google LLC
*/
@ -10,12 +10,12 @@
#include <linux/uaccess.h>
/**
* fsverity_ioctl_measure() - get a verity file's measurement
* @filp: file to get measurement of
* fsverity_ioctl_measure() - get a verity file's digest
* @filp: file to get digest of
* @_uarg: user pointer to fsverity_digest
*
* Retrieve the file measurement that the kernel is enforcing for reads from a
* verity file. See the "FS_IOC_MEASURE_VERITY" section of
* Retrieve the file digest that the kernel is enforcing for reads from a verity
* file. See the "FS_IOC_MEASURE_VERITY" section of
* Documentation/filesystems/fsverity.rst for the documentation.
*
* Return: 0 on success, -errno on failure
@ -51,7 +51,7 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
if (copy_to_user(uarg, &arg, sizeof(arg)))
return -EFAULT;
if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size))
if (copy_to_user(uarg->digest, vi->file_digest, hash_alg->digest_size))
return -EFAULT;
return 0;

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fs/verity/open.c: opening fs-verity files
* Opening fs-verity files
*
* Copyright 2019 Google LLC
*/
@ -124,63 +124,35 @@ out_err:
}
/*
* Compute the file measurement by hashing the fsverity_descriptor excluding the
* Compute the file digest by hashing the fsverity_descriptor excluding the
* signature and with the sig_size field set to 0.
*/
static int compute_file_measurement(struct fsverity_hash_alg *hash_alg,
struct fsverity_descriptor *desc,
u8 *measurement)
static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
struct fsverity_descriptor *desc,
u8 *file_digest)
{
__le32 sig_size = desc->sig_size;
int err;
desc->sig_size = 0;
err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
desc->sig_size = sig_size;
return err;
}
/*
* Validate the given fsverity_descriptor and create a new fsverity_info from
* it. The signature (if present) is also checked.
* Create a new fsverity_info from the given fsverity_descriptor (with optional
* appended signature), and check the signature if present. The
* fsverity_descriptor must have already undergone basic validation.
*/
struct fsverity_info *fsverity_create_info(const struct inode *inode,
void *_desc, size_t desc_size)
struct fsverity_descriptor *desc,
size_t desc_size)
{
struct fsverity_descriptor *desc = _desc;
struct fsverity_info *vi;
int err;
if (desc_size < sizeof(*desc)) {
fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
desc_size);
return ERR_PTR(-EINVAL);
}
if (desc->version != 1) {
fsverity_err(inode, "Unrecognized descriptor version: %u",
desc->version);
return ERR_PTR(-EINVAL);
}
if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
fsverity_err(inode, "Reserved bits set in descriptor");
return ERR_PTR(-EINVAL);
}
if (desc->salt_size > sizeof(desc->salt)) {
fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
return ERR_PTR(-EINVAL);
}
if (le64_to_cpu(desc->data_size) != inode->i_size) {
fsverity_err(inode,
"Wrong data_size: %llu (desc) != %lld (inode)",
le64_to_cpu(desc->data_size), inode->i_size);
return ERR_PTR(-EINVAL);
}
vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
if (!vi)
return ERR_PTR(-ENOMEM);
@ -199,17 +171,18 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
err = compute_file_measurement(vi->tree_params.hash_alg, desc,
vi->measurement);
err = compute_file_digest(vi->tree_params.hash_alg, desc,
vi->file_digest);
if (err) {
fsverity_err(inode, "Error %d computing file measurement", err);
fsverity_err(inode, "Error %d computing file digest", err);
goto out;
}
pr_debug("Computed file measurement: %s:%*phN\n",
pr_debug("Computed file digest: %s:%*phN\n",
vi->tree_params.hash_alg->name,
vi->tree_params.digest_size, vi->measurement);
vi->tree_params.digest_size, vi->file_digest);
err = fsverity_verify_signature(vi, desc, desc_size);
err = fsverity_verify_signature(vi, desc->signature,
le32_to_cpu(desc->sig_size));
out:
if (err) {
fsverity_free_info(vi);
@ -221,11 +194,20 @@ out:
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
{
/*
* Multiple processes may race to set ->i_verity_info, so use cmpxchg.
* This pairs with the READ_ONCE() in fsverity_get_info().
* Multiple tasks may race to set ->i_verity_info, so use
* cmpxchg_release(). This pairs with the smp_load_acquire() in
* fsverity_get_info(). I.e., here we publish ->i_verity_info with a
* RELEASE barrier so that other tasks can ACQUIRE it.
*/
if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL)
if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
/* Lost the race, so free the fsverity_info we allocated. */
fsverity_free_info(vi);
/*
* Afterwards, the caller may access ->i_verity_info directly,
* so make sure to ACQUIRE the winning fsverity_info.
*/
(void)fsverity_get_info(inode);
}
}
void fsverity_free_info(struct fsverity_info *vi)
@ -236,15 +218,57 @@ void fsverity_free_info(struct fsverity_info *vi)
kmem_cache_free(fsverity_info_cachep, vi);
}
/* Ensure the inode has an ->i_verity_info */
static int ensure_verity_info(struct inode *inode)
static bool validate_fsverity_descriptor(struct inode *inode,
const struct fsverity_descriptor *desc,
size_t desc_size)
{
struct fsverity_info *vi = fsverity_get_info(inode);
struct fsverity_descriptor *desc;
int res;
if (desc_size < sizeof(*desc)) {
fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
desc_size);
return false;
}
if (vi)
return 0;
if (desc->version != 1) {
fsverity_err(inode, "Unrecognized descriptor version: %u",
desc->version);
return false;
}
if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
fsverity_err(inode, "Reserved bits set in descriptor");
return false;
}
if (desc->salt_size > sizeof(desc->salt)) {
fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
return false;
}
if (le64_to_cpu(desc->data_size) != inode->i_size) {
fsverity_err(inode,
"Wrong data_size: %llu (desc) != %lld (inode)",
le64_to_cpu(desc->data_size), inode->i_size);
return false;
}
if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) {
fsverity_err(inode, "Signature overflows verity descriptor");
return false;
}
return true;
}
/*
* Read the inode's fsverity_descriptor (with optional appended signature) from
* the filesystem, and do basic validation of it.
*/
int fsverity_get_descriptor(struct inode *inode,
struct fsverity_descriptor **desc_ret,
size_t *desc_size_ret)
{
int res;
struct fsverity_descriptor *desc;
res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
if (res < 0) {
@ -263,20 +287,46 @@ static int ensure_verity_info(struct inode *inode)
res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
if (res < 0) {
fsverity_err(inode, "Error %d reading verity descriptor", res);
goto out_free_desc;
kfree(desc);
return res;
}
vi = fsverity_create_info(inode, desc, res);
if (!validate_fsverity_descriptor(inode, desc, res)) {
kfree(desc);
return -EINVAL;
}
*desc_ret = desc;
*desc_size_ret = res;
return 0;
}
/* Ensure the inode has an ->i_verity_info */
static int ensure_verity_info(struct inode *inode)
{
struct fsverity_info *vi = fsverity_get_info(inode);
struct fsverity_descriptor *desc;
size_t desc_size;
int err;
if (vi)
return 0;
err = fsverity_get_descriptor(inode, &desc, &desc_size);
if (err)
return err;
vi = fsverity_create_info(inode, desc, desc_size);
if (IS_ERR(vi)) {
res = PTR_ERR(vi);
err = PTR_ERR(vi);
goto out_free_desc;
}
fsverity_set_info(inode, vi);
res = 0;
err = 0;
out_free_desc:
kfree(desc);
return res;
return err;
}
/**

195
fs/verity/read_metadata.c Normal file
View File

@ -0,0 +1,195 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Ioctl to read verity metadata
*
* Copyright 2021 Google LLC
*/
#include "fsverity_private.h"
#include <linux/backing-dev.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
static int fsverity_read_merkle_tree(struct inode *inode,
const struct fsverity_info *vi,
void __user *buf, u64 offset, int length)
{
const struct fsverity_operations *vops = inode->i_sb->s_vop;
u64 end_offset;
unsigned int offs_in_page;
pgoff_t index, last_index;
int retval = 0;
int err = 0;
end_offset = min(offset + length, vi->tree_params.tree_size);
if (offset >= end_offset)
return 0;
offs_in_page = offset_in_page(offset);
last_index = (end_offset - 1) >> PAGE_SHIFT;
/*
* Iterate through each Merkle tree page in the requested range and copy
* the requested portion to userspace. Note that the Merkle tree block
* size isn't important here, as we are returning a byte stream; i.e.,
* we can just work with pages even if the tree block size != PAGE_SIZE.
*/
for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
unsigned long num_ra_pages =
min_t(unsigned long, last_index - index + 1,
inode->i_sb->s_bdi->io_pages);
unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
PAGE_SIZE - offs_in_page);
struct page *page;
const void *virt;
page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
if (IS_ERR(page)) {
err = PTR_ERR(page);
fsverity_err(inode,
"Error %d reading Merkle tree page %lu",
err, index);
break;
}
virt = kmap(page);
if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
kunmap(page);
put_page(page);
err = -EFAULT;
break;
}
kunmap(page);
put_page(page);
retval += bytes_to_copy;
buf += bytes_to_copy;
offset += bytes_to_copy;
if (fatal_signal_pending(current)) {
err = -EINTR;
break;
}
cond_resched();
offs_in_page = 0;
}
return retval ? retval : err;
}
/* Copy the requested portion of the buffer to userspace. */
static int fsverity_read_buffer(void __user *dst, u64 offset, int length,
const void *src, size_t src_length)
{
if (offset >= src_length)
return 0;
src += offset;
src_length -= offset;
length = min_t(size_t, length, src_length);
if (copy_to_user(dst, src, length))
return -EFAULT;
return length;
}
static int fsverity_read_descriptor(struct inode *inode,
void __user *buf, u64 offset, int length)
{
struct fsverity_descriptor *desc;
size_t desc_size;
int res;
res = fsverity_get_descriptor(inode, &desc, &desc_size);
if (res)
return res;
/* don't include the signature */
desc_size = offsetof(struct fsverity_descriptor, signature);
desc->sig_size = 0;
res = fsverity_read_buffer(buf, offset, length, desc, desc_size);
kfree(desc);
return res;
}
static int fsverity_read_signature(struct inode *inode,
void __user *buf, u64 offset, int length)
{
struct fsverity_descriptor *desc;
size_t desc_size;
int res;
res = fsverity_get_descriptor(inode, &desc, &desc_size);
if (res)
return res;
if (desc->sig_size == 0) {
res = -ENODATA;
goto out;
}
/*
* Include only the signature. Note that fsverity_get_descriptor()
* already verified that sig_size is in-bounds.
*/
res = fsverity_read_buffer(buf, offset, length, desc->signature,
le32_to_cpu(desc->sig_size));
out:
kfree(desc);
return res;
}
/**
* fsverity_ioctl_read_metadata() - read verity metadata from a file
* @filp: file to read the metadata from
* @uarg: user pointer to fsverity_read_metadata_arg
*
* Return: length read on success, 0 on EOF, -errno on failure
*/
int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg)
{
struct inode *inode = file_inode(filp);
const struct fsverity_info *vi;
struct fsverity_read_metadata_arg arg;
int length;
void __user *buf;
vi = fsverity_get_info(inode);
if (!vi)
return -ENODATA; /* not a verity file */
/*
* Note that we don't have to explicitly check that the file is open for
* reading, since verity files can only be opened for reading.
*/
if (copy_from_user(&arg, uarg, sizeof(arg)))
return -EFAULT;
if (arg.__reserved)
return -EINVAL;
/* offset + length must not overflow. */
if (arg.offset + arg.length < arg.offset)
return -EINVAL;
/* Ensure that the return value will fit in INT_MAX. */
length = min_t(u64, arg.length, INT_MAX);
buf = u64_to_user_ptr(arg.buf_ptr);
switch (arg.metadata_type) {
case FS_VERITY_METADATA_TYPE_MERKLE_TREE:
return fsverity_read_merkle_tree(inode, vi, buf, arg.offset,
length);
case FS_VERITY_METADATA_TYPE_DESCRIPTOR:
return fsverity_read_descriptor(inode, buf, arg.offset, length);
case FS_VERITY_METADATA_TYPE_SIGNATURE:
return fsverity_read_signature(inode, buf, arg.offset, length);
default:
return -EINVAL;
}
}
EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata);

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fs/verity/signature.c: verification of builtin signatures
* Verification of builtin signatures
*
* Copyright 2019 Google LLC
*/
@ -29,22 +29,20 @@ static struct key *fsverity_keyring;
/**
* fsverity_verify_signature() - check a verity file's signature
* @vi: the file's fsverity_info
* @desc: the file's fsverity_descriptor
* @desc_size: size of @desc
* @signature: the file's built-in signature
* @sig_size: size of signature in bytes, or 0 if no signature
*
* If the file's fs-verity descriptor includes a signature of the file
* measurement, verify it against the certificates in the fs-verity keyring.
* If the file includes a signature of its fs-verity file digest, verify it
* against the certificates in the fs-verity keyring.
*
* Return: 0 on success (signature valid or not required); -errno on failure
*/
int fsverity_verify_signature(const struct fsverity_info *vi,
const struct fsverity_descriptor *desc,
size_t desc_size)
const u8 *signature, size_t sig_size)
{
const struct inode *inode = vi->inode;
const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
const u32 sig_size = le32_to_cpu(desc->sig_size);
struct fsverity_signed_digest *d;
struct fsverity_formatted_digest *d;
int err;
if (sig_size == 0) {
@ -56,22 +54,16 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
if (sig_size > desc_size - sizeof(*desc)) {
fsverity_err(inode, "Signature overflows verity descriptor");
return -EBADMSG;
}
d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
if (!d)
return -ENOMEM;
memcpy(d->magic, "FSVerity", 8);
d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs);
d->digest_size = cpu_to_le16(hash_alg->digest_size);
memcpy(d->digest, vi->measurement, hash_alg->digest_size);
memcpy(d->digest, vi->file_digest, hash_alg->digest_size);
err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
desc->signature, sig_size,
fsverity_keyring,
signature, sig_size, fsverity_keyring,
VERIFYING_UNSPECIFIED_SIGNATURE,
NULL, NULL);
kfree(d);
@ -90,8 +82,8 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return err;
}
pr_debug("Valid signature for file measurement %s:%*phN\n",
hash_alg->name, hash_alg->digest_size, vi->measurement);
pr_debug("Valid signature for file digest %s:%*phN\n",
hash_alg->name, hash_alg->digest_size, vi->file_digest);
return 0;
}

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages()
* Data verification functions, i.e. hooks for ->readpages()
*
* Copyright 2019 Google LLC
*/

View File

@ -219,7 +219,7 @@ struct dentry_operations {
#define DCACHE_MAY_FREE 0x00800000
#define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */
#define DCACHE_ENCRYPTED_NAME 0x02000000 /* Encrypted name (dir key was unavailable) */
#define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */
#define DCACHE_OP_REAL 0x04000000
#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */

View File

@ -34,6 +34,7 @@
#define F2FS_ROOT_INO(sbi) ((sbi)->root_ino_num)
#define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num)
#define F2FS_META_INO(sbi) ((sbi)->meta_ino_num)
#define F2FS_COMPRESS_INO(sbi) (NM_I(sbi)->max_nid)
#define F2FS_MAX_QUOTAS 3
@ -168,7 +169,7 @@ struct f2fs_checkpoint {
unsigned char alloc_type[MAX_ACTIVE_LOGS];
/* SIT and NAT version bitmap */
unsigned char sit_nat_version_bitmap[1];
unsigned char sit_nat_version_bitmap[];
} __packed;
#define CP_CHKSUM_OFFSET 4092 /* default chksum offset in checkpoint */
@ -229,6 +230,7 @@ struct f2fs_extent {
#define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */
#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */
#define F2FS_PIN_FILE 0x40 /* file should not be gced */
#define F2FS_COMPRESS_RELEASED 0x80 /* file released compressed blocks */
struct f2fs_inode {
__le16 i_mode; /* file mode */
@ -273,7 +275,10 @@ struct f2fs_inode {
__le64 i_compr_blocks; /* # of compressed blocks */
__u8 i_compress_algorithm; /* compress algorithm */
__u8 i_log_cluster_size; /* log of cluster size */
__le16 i_padding; /* padding */
__le16 i_compress_flag; /* compress flag */
/* 0 bit: chksum flag
* [10,15] bits: compress level
*/
__le32 i_extra_end[0]; /* for attribute size calculation */
} __packed;
__le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */

View File

@ -901,9 +901,7 @@ struct file {
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
#ifdef CONFIG_FILE_TABLE_DEBUG
struct hlist_node f_hash;
#endif /* #ifdef CONFIG_FILE_TABLE_DEBUG */
errseq_t f_sb_err; /* for syncfs */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
@ -1320,7 +1318,7 @@ extern int send_sigurg(struct fown_struct *fown);
/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL (1 << 0)
#define sb_has_enc_strict_mode(sb) \
#define sb_has_strict_encoding(sb) \
(sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
/*
@ -1389,7 +1387,10 @@ struct super_block {
#ifdef CONFIG_FS_VERITY
const struct fsverity_operations *s_vop;
#endif
#ifdef CONFIG_UNICODE
struct unicode_map *s_encoding;
__u16 s_encoding_flags;
#endif
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
#ifdef CONFIG_UNICODE
struct unicode_map *s_encoding;
@ -1443,6 +1444,9 @@ struct super_block {
/* Being remounted read-only */
int s_readonly_remount;
/* per-sb errseq_t for reporting writeback errors via syncfs */
errseq_t s_wb_err;
/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
@ -1668,6 +1672,9 @@ struct fiemap_extent_info {
struct fiemap_extent __user *fi_extents_start; /* Start of
fiemap_extent array */
};
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 *len, u32 supported_flags);
int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
u64 phys, u64 len, u32 flags);
int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
@ -2733,6 +2740,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
return errseq_sample(&mapping->wb_err);
}
/**
* file_sample_sb_err - sample the current errseq_t to test for later errors
* @mapping: mapping to be sampled
*
* Grab the most current superblock-level errseq_t value for the given
* struct file.
*/
static inline errseq_t file_sample_sb_err(struct file *file)
{
return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern int vfs_fsync(struct file *file, int datasync);
@ -2757,9 +2776,16 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
extern void emergency_sync(void);
extern void emergency_remount(void);
#ifdef CONFIG_BLOCK
extern sector_t bmap(struct inode *, sector_t);
extern int bmap(struct inode *inode, sector_t *block);
#else
static inline int bmap(struct inode *inode, sector_t *block)
{
return -EINVAL;
}
#endif
extern int notify_change(struct dentry *, struct iattr *, struct inode **);
extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **);
extern int inode_permission(struct inode *, int);
@ -3225,19 +3251,7 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_check_addressable(unsigned, u64);
#ifdef CONFIG_UNICODE
extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name);
extern bool needs_casefold(const struct inode *dir);
#else
static inline bool needs_casefold(const struct inode *dir)
{
return 0;
}
#endif
extern void generic_set_encrypted_ci_d_ops(struct inode *dir,
struct dentry *dentry);
extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_page(struct address_space *,

View File

@ -36,7 +36,7 @@ struct fscrypt_name {
u32 hash;
u32 minor_hash;
struct fscrypt_str crypto_buf;
bool is_ciphertext_name;
bool is_nokey_name;
};
#define FSTR_INIT(n, l) { .name = n, .len = l }
@ -107,15 +107,15 @@ fscrypt_get_dummy_context(struct super_block *sb)
}
/*
* When d_splice_alias() moves a directory's encrypted alias to its decrypted
* alias as a result of the encryption key being added, DCACHE_ENCRYPTED_NAME
* must be cleared. Note that we don't have to support arbitrary moves of this
* flag because fscrypt doesn't allow encrypted aliases to be the source or
* target of a rename().
* When d_splice_alias() moves a directory's no-key alias to its plaintext alias
* as a result of the encryption key being added, DCACHE_NOKEY_NAME must be
* cleared. Note that we don't have to support arbitrary moves of this flag
* because fscrypt doesn't allow no-key names to be the source or target of a
* rename().
*/
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
dentry->d_flags &= ~DCACHE_ENCRYPTED_NAME;
dentry->d_flags &= ~DCACHE_NOKEY_NAME;
}
/* crypto.c */
@ -207,6 +207,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
bool fscrypt_match_name(const struct fscrypt_name *fname,
const u8 *de_name, u32 de_name_len);
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);
int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags);
/* bio.c */
void fscrypt_decrypt_bio(struct bio *bio);
@ -471,6 +472,12 @@ static inline u64 fscrypt_fname_siphash(const struct inode *dir,
return 0;
}
static inline int fscrypt_d_revalidate(struct dentry *dentry,
unsigned int flags)
{
return 1;
}
/* bio.c */
static inline void fscrypt_decrypt_bio(struct bio *bio)
{
@ -729,18 +736,19 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir,
* @fname: (output) the name to use to search the on-disk directory
*
* Prepare for ->lookup() in a directory which may be encrypted by determining
* the name that will actually be used to search the directory on-disk. Lookups
* can be done with or without the directory's encryption key; without the key,
* filenames are presented in encrypted form. Therefore, we'll try to set up
* the directory's encryption key, but even without it the lookup can continue.
* the name that will actually be used to search the directory on-disk. If the
* directory's encryption key is available, then the lookup is assumed to be by
* plaintext name; otherwise, it is assumed to be by no-key name.
*
* After calling this function, a filesystem should ensure that it's dentry
* operations contain fscrypt_d_revalidate if DCACHE_ENCRYPTED_NAME was set,
* so that the dentry can be invalidated if the key is later added.
* This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key
* name. In this case the filesystem must assign the dentry a dentry_operations
* which contains fscrypt_d_revalidate (or contains a d_revalidate method that
* calls fscrypt_d_revalidate), so that the dentry will be invalidated if the
* directory's encryption key is later added.
*
* Return: 0 on success; -ENOENT if key is unavailable but the filename isn't a
* correctly formed encoded ciphertext name, so a negative dentry should be
* created; or another -errno code.
* Return: 0 on success; -ENOENT if the directory's key is unavailable but the
* filename isn't a valid no-key name, so a negative dentry should be created;
* or another -errno code.
*/
static inline int fscrypt_prepare_lookup(struct inode *dir,
struct dentry *dentry,

View File

@ -115,8 +115,13 @@ struct fsverity_operations {
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
/* pairs with the cmpxchg() in fsverity_set_info() */
return READ_ONCE(inode->i_verity_info);
/*
* Pairs with the cmpxchg_release() in fsverity_set_info().
* I.e., another task may publish ->i_verity_info concurrently,
* executing a RELEASE barrier. We need to use smp_load_acquire() here
* to safely ACQUIRE the memory the other task published.
*/
return smp_load_acquire(&inode->i_verity_info);
}
/* enable.c */
@ -133,6 +138,10 @@ int fsverity_file_open(struct inode *inode, struct file *filp);
int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
void fsverity_cleanup_inode(struct inode *inode);
/* read_metadata.c */
int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);
/* verify.c */
bool fsverity_verify_page(struct page *page);
@ -178,6 +187,14 @@ static inline void fsverity_cleanup_inode(struct inode *inode)
{
}
/* read_metadata.c */
static inline int fsverity_ioctl_read_metadata(struct file *filp,
const void __user *uarg)
{
return -EOPNOTSUPP;
}
/* verify.c */
static inline bool fsverity_verify_page(struct page *page)

View File

@ -49,7 +49,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error)
return;
/* Record in wb_err for checkers using errseq_t based tracking */
filemap_set_wb_err(mapping, error);
__filemap_set_wb_err(mapping, error);
/* Record it in superblock */
errseq_set(&mapping->host->i_sb->s_wb_err, error);
/* Record it in flags for now, for legacy callers */
if (error == -ENOSPC)

View File

@ -6,6 +6,7 @@
#define _TRACE_F2FS_H
#include <linux/tracepoint.h>
#include <uapi/linux/f2fs.h>
#define show_dev(dev) MAJOR(dev), MINOR(dev)
#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino
@ -121,13 +122,15 @@ TRACE_DEFINE_ENUM(CP_RESIZE);
#define show_alloc_mode(type) \
__print_symbolic(type, \
{ LFS, "LFS-mode" }, \
{ SSR, "SSR-mode" })
{ LFS, "LFS-mode" }, \
{ SSR, "SSR-mode" }, \
{ AT_SSR, "AT_SSR-mode" })
#define show_victim_policy(type) \
__print_symbolic(type, \
{ GC_GREEDY, "Greedy" }, \
{ GC_CB, "Cost-Benefit" })
{ GC_CB, "Cost-Benefit" }, \
{ GC_AT, "Age-threshold" })
#define show_cpreason(type) \
__print_flags(type, "|", \
@ -546,17 +549,17 @@ TRACE_EVENT(f2fs_truncate_partial_nodes,
TRACE_EVENT(f2fs_file_write_iter,
TP_PROTO(struct inode *inode, unsigned long offset,
unsigned long length, int ret),
TP_PROTO(struct inode *inode, loff_t offset, size_t length,
ssize_t ret),
TP_ARGS(inode, offset, length, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
__field(unsigned long, offset)
__field(unsigned long, length)
__field(int, ret)
__field(loff_t, offset)
__field(size_t, length)
__field(ssize_t, ret)
),
TP_fast_assign(
@ -568,7 +571,7 @@ TRACE_EVENT(f2fs_file_write_iter,
),
TP_printk("dev = (%d,%d), ino = %lu, "
"offset = %lu, length = %lu, written(err) = %d",
"offset = %lld, length = %zu, written(err) = %zd",
show_dev_ino(__entry),
__entry->offset,
__entry->length,
@ -576,9 +579,10 @@ TRACE_EVENT(f2fs_file_write_iter,
);
TRACE_EVENT(f2fs_map_blocks,
TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int ret),
TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag, int ret),
TP_ARGS(inode, map, ret),
TP_ARGS(inode, map, create, flag, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
@ -589,11 +593,14 @@ TRACE_EVENT(f2fs_map_blocks,
__field(unsigned int, m_flags)
__field(int, m_seg_type)
__field(bool, m_may_create)
__field(bool, m_multidev_dio)
__field(int, create)
__field(int, flag)
__field(int, ret)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->dev = map->m_bdev->bd_dev;
__entry->ino = inode->i_ino;
__entry->m_lblk = map->m_lblk;
__entry->m_pblk = map->m_pblk;
@ -601,12 +608,16 @@ TRACE_EVENT(f2fs_map_blocks,
__entry->m_flags = map->m_flags;
__entry->m_seg_type = map->m_seg_type;
__entry->m_may_create = map->m_may_create;
__entry->m_multidev_dio = map->m_multidev_dio;
__entry->create = create;
__entry->flag = flag;
__entry->ret = ret;
),
TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, "
"start blkaddr = 0x%llx, len = 0x%llx, flags = %u,"
"seg_type = %d, may_create = %d, err = %d",
"start blkaddr = 0x%llx, len = 0x%llx, flags = %u, "
"seg_type = %d, may_create = %d, multidevice = %d, "
"create = %d, flag = %d, err = %d",
show_dev_ino(__entry),
(unsigned long long)__entry->m_lblk,
(unsigned long long)__entry->m_pblk,
@ -614,6 +625,9 @@ TRACE_EVENT(f2fs_map_blocks,
__entry->m_flags,
__entry->m_seg_type,
__entry->m_may_create,
__entry->m_multidev_dio,
__entry->create,
__entry->flag,
__entry->ret)
);
@ -813,20 +827,20 @@ TRACE_EVENT(f2fs_lookup_start,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
__field(const char *, name)
__string(name, dentry->d_name.name)
__field(unsigned int, flags)
),
TP_fast_assign(
__entry->dev = dir->i_sb->s_dev;
__entry->ino = dir->i_ino;
__entry->name = dentry->d_name.name;
__assign_str(name, dentry->d_name.name);
__entry->flags = flags;
),
TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u",
show_dev_ino(__entry),
__entry->name,
__get_str(name),
__entry->flags)
);
@ -840,7 +854,7 @@ TRACE_EVENT(f2fs_lookup_end,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
__field(const char *, name)
__string(name, dentry->d_name.name)
__field(nid_t, cino)
__field(int, err)
),
@ -848,14 +862,14 @@ TRACE_EVENT(f2fs_lookup_end,
TP_fast_assign(
__entry->dev = dir->i_sb->s_dev;
__entry->ino = dir->i_ino;
__entry->name = dentry->d_name.name;
__assign_str(name, dentry->d_name.name);
__entry->cino = ino;
__entry->err = err;
),
TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d",
show_dev_ino(__entry),
__entry->name,
__get_str(name),
__entry->cino,
__entry->err)
);
@ -1824,6 +1838,7 @@ DEFINE_EVENT(f2fs_zip_end, f2fs_decompress_pages_end,
TP_ARGS(inode, cluster_idx, compressed_size, ret)
);
#ifdef CONFIG_F2FS_IOSTAT
TRACE_EVENT(f2fs_iostat,
TP_PROTO(struct f2fs_sb_info *sbi, unsigned long long *iostat),
@ -1900,6 +1915,165 @@ TRACE_EVENT(f2fs_iostat,
__entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio)
);
#ifndef __F2FS_IOSTAT_LATENCY_TYPE
#define __F2FS_IOSTAT_LATENCY_TYPE
struct f2fs_iostat_latency {
unsigned int peak_lat;
unsigned int avg_lat;
unsigned int cnt;
};
#endif /* __F2FS_IOSTAT_LATENCY_TYPE */
TRACE_EVENT(f2fs_iostat_latency,
TP_PROTO(struct f2fs_sb_info *sbi, struct f2fs_iostat_latency (*iostat_lat)[NR_PAGE_TYPE]),
TP_ARGS(sbi, iostat_lat),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, d_rd_peak)
__field(unsigned int, d_rd_avg)
__field(unsigned int, d_rd_cnt)
__field(unsigned int, n_rd_peak)
__field(unsigned int, n_rd_avg)
__field(unsigned int, n_rd_cnt)
__field(unsigned int, m_rd_peak)
__field(unsigned int, m_rd_avg)
__field(unsigned int, m_rd_cnt)
__field(unsigned int, d_wr_s_peak)
__field(unsigned int, d_wr_s_avg)
__field(unsigned int, d_wr_s_cnt)
__field(unsigned int, n_wr_s_peak)
__field(unsigned int, n_wr_s_avg)
__field(unsigned int, n_wr_s_cnt)
__field(unsigned int, m_wr_s_peak)
__field(unsigned int, m_wr_s_avg)
__field(unsigned int, m_wr_s_cnt)
__field(unsigned int, d_wr_as_peak)
__field(unsigned int, d_wr_as_avg)
__field(unsigned int, d_wr_as_cnt)
__field(unsigned int, n_wr_as_peak)
__field(unsigned int, n_wr_as_avg)
__field(unsigned int, n_wr_as_cnt)
__field(unsigned int, m_wr_as_peak)
__field(unsigned int, m_wr_as_avg)
__field(unsigned int, m_wr_as_cnt)
),
TP_fast_assign(
__entry->dev = sbi->sb->s_dev;
__entry->d_rd_peak = iostat_lat[0][DATA].peak_lat;
__entry->d_rd_avg = iostat_lat[0][DATA].avg_lat;
__entry->d_rd_cnt = iostat_lat[0][DATA].cnt;
__entry->n_rd_peak = iostat_lat[0][NODE].peak_lat;
__entry->n_rd_avg = iostat_lat[0][NODE].avg_lat;
__entry->n_rd_cnt = iostat_lat[0][NODE].cnt;
__entry->m_rd_peak = iostat_lat[0][META].peak_lat;
__entry->m_rd_avg = iostat_lat[0][META].avg_lat;
__entry->m_rd_cnt = iostat_lat[0][META].cnt;
__entry->d_wr_s_peak = iostat_lat[1][DATA].peak_lat;
__entry->d_wr_s_avg = iostat_lat[1][DATA].avg_lat;
__entry->d_wr_s_cnt = iostat_lat[1][DATA].cnt;
__entry->n_wr_s_peak = iostat_lat[1][NODE].peak_lat;
__entry->n_wr_s_avg = iostat_lat[1][NODE].avg_lat;
__entry->n_wr_s_cnt = iostat_lat[1][NODE].cnt;
__entry->m_wr_s_peak = iostat_lat[1][META].peak_lat;
__entry->m_wr_s_avg = iostat_lat[1][META].avg_lat;
__entry->m_wr_s_cnt = iostat_lat[1][META].cnt;
__entry->d_wr_as_peak = iostat_lat[2][DATA].peak_lat;
__entry->d_wr_as_avg = iostat_lat[2][DATA].avg_lat;
__entry->d_wr_as_cnt = iostat_lat[2][DATA].cnt;
__entry->n_wr_as_peak = iostat_lat[2][NODE].peak_lat;
__entry->n_wr_as_avg = iostat_lat[2][NODE].avg_lat;
__entry->n_wr_as_cnt = iostat_lat[2][NODE].cnt;
__entry->m_wr_as_peak = iostat_lat[2][META].peak_lat;
__entry->m_wr_as_avg = iostat_lat[2][META].avg_lat;
__entry->m_wr_as_cnt = iostat_lat[2][META].cnt;
),
TP_printk("dev = (%d,%d), "
"iotype [peak lat.(ms)/avg lat.(ms)/count], "
"rd_data [%u/%u/%u], rd_node [%u/%u/%u], rd_meta [%u/%u/%u], "
"wr_sync_data [%u/%u/%u], wr_sync_node [%u/%u/%u], "
"wr_sync_meta [%u/%u/%u], wr_async_data [%u/%u/%u], "
"wr_async_node [%u/%u/%u], wr_async_meta [%u/%u/%u]",
show_dev(__entry->dev),
__entry->d_rd_peak, __entry->d_rd_avg, __entry->d_rd_cnt,
__entry->n_rd_peak, __entry->n_rd_avg, __entry->n_rd_cnt,
__entry->m_rd_peak, __entry->m_rd_avg, __entry->m_rd_cnt,
__entry->d_wr_s_peak, __entry->d_wr_s_avg, __entry->d_wr_s_cnt,
__entry->n_wr_s_peak, __entry->n_wr_s_avg, __entry->n_wr_s_cnt,
__entry->m_wr_s_peak, __entry->m_wr_s_avg, __entry->m_wr_s_cnt,
__entry->d_wr_as_peak, __entry->d_wr_as_avg, __entry->d_wr_as_cnt,
__entry->n_wr_as_peak, __entry->n_wr_as_avg, __entry->n_wr_as_cnt,
__entry->m_wr_as_peak, __entry->m_wr_as_avg, __entry->m_wr_as_cnt)
);
#endif
TRACE_EVENT(f2fs_bmap,
TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock),
TP_ARGS(inode, lblock, pblock),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
__field(sector_t, lblock)
__field(sector_t, pblock)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->lblock = lblock;
__entry->pblock = pblock;
),
TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld",
show_dev_ino(__entry),
(unsigned long long)__entry->lblock,
(unsigned long long)__entry->pblock)
);
TRACE_EVENT(f2fs_fiemap,
TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock,
unsigned long long len, unsigned int flags, int ret),
TP_ARGS(inode, lblock, pblock, len, flags, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
__field(sector_t, lblock)
__field(sector_t, pblock)
__field(unsigned long long, len)
__field(unsigned int, flags)
__field(int, ret)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->lblock = lblock;
__entry->pblock = pblock;
__entry->len = len;
__entry->flags = flags;
__entry->ret = ret;
),
TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld, "
"len:%llu, flags:%u, ret:%d",
show_dev_ino(__entry),
(unsigned long long)__entry->lblock,
(unsigned long long)__entry->pblock,
__entry->len,
__entry->flags,
__entry->ret)
);
#endif /* _TRACE_F2FS_H */
/* This part must be outside protection */

98
include/uapi/linux/f2fs.h Normal file
View File

@ -0,0 +1,98 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_F2FS_H
#define _UAPI_LINUX_F2FS_H
#include <linux/types.h>
#include <linux/ioctl.h>
/*
* f2fs-specific ioctl commands
*/
#define F2FS_IOCTL_MAGIC 0xf5
#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \
struct f2fs_defragment)
#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
struct f2fs_move_range)
#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \
struct f2fs_flush_device)
#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \
struct f2fs_gc_range)
#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32)
#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32)
#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15)
#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64)
#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \
_IOR(F2FS_IOCTL_MAGIC, 18, __u64)
#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \
_IOR(F2FS_IOCTL_MAGIC, 19, __u64)
#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \
struct f2fs_sectrim_range)
#define F2FS_IOC_GET_COMPRESS_OPTION _IOR(F2FS_IOCTL_MAGIC, 21, \
struct f2fs_comp_option)
#define F2FS_IOC_SET_COMPRESS_OPTION _IOW(F2FS_IOCTL_MAGIC, 22, \
struct f2fs_comp_option)
#define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23)
#define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24)
/*
* should be same as XFS_IOC_GOINGDOWN.
* Flags for going down operation used by FS_IOC_GOINGDOWN
*/
#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */
#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
/*
* Flags used by F2FS_IOC_SEC_TRIM_FILE
*/
#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */
#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */
#define F2FS_TRIM_FILE_MASK 0x3
struct f2fs_gc_range {
__u32 sync;
__u64 start;
__u64 len;
};
struct f2fs_defragment {
__u64 start;
__u64 len;
};
struct f2fs_move_range {
__u32 dst_fd; /* destination fd */
__u64 pos_in; /* start position in src_fd */
__u64 pos_out; /* start position in dst_fd */
__u64 len; /* size to move */
};
struct f2fs_flush_device {
__u32 dev_num; /* device number to flush */
__u32 segments; /* # of segments to flush */
};
struct f2fs_sectrim_range {
__u64 start;
__u64 len;
__u64 flags;
};
struct f2fs_comp_option {
__u8 algorithm;
__u8 log_cluster_size;
};
#endif /* _UAPI_LINUX_F2FS_H */

View File

@ -34,7 +34,70 @@ struct fsverity_digest {
__u8 digest[];
};
/*
* Struct containing a file's Merkle tree properties. The fs-verity file digest
* is the hash of this struct. A userspace program needs this struct only if it
* needs to compute fs-verity file digests itself, e.g. in order to sign files.
* It isn't needed just to enable fs-verity on a file.
*
* Note: when computing the file digest, 'sig_size' and 'signature' must be left
* zero and empty, respectively. These fields are present only because some
* filesystems reuse this struct as part of their on-disk format.
*/
struct fsverity_descriptor {
__u8 version; /* must be 1 */
__u8 hash_algorithm; /* Merkle tree hash algorithm */
__u8 log_blocksize; /* log2 of size of data and tree blocks */
__u8 salt_size; /* size of salt in bytes; 0 if none */
#ifdef __KERNEL__
__le32 sig_size;
#else
__le32 __reserved_0x04; /* must be 0 */
#endif
__le64 data_size; /* size of file the Merkle tree is built over */
__u8 root_hash[64]; /* Merkle tree root hash */
__u8 salt[32]; /* salt prepended to each hashed block */
__u8 __reserved[144]; /* must be 0's */
#ifdef __KERNEL__
__u8 signature[];
#endif
};
/*
* Format in which fs-verity file digests are signed in built-in signatures.
* This is the same as 'struct fsverity_digest', except here some magic bytes
* are prepended to provide some context about what is being signed in case the
* same key is used for non-fsverity purposes, and here the fields have fixed
* endianness.
*
* This struct is specific to the built-in signature verification support, which
* is optional. fs-verity users may also verify signatures in userspace, in
* which case userspace is responsible for deciding on what bytes are signed.
* This struct may still be used, but it doesn't have to be. For example,
* userspace could instead use a string like "sha256:$digest_as_hex_string".
*/
struct fsverity_formatted_digest {
char magic[8]; /* must be "FSVerity" */
__le16 digest_algorithm;
__le16 digest_size;
__u8 digest[];
};
#define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1
#define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2
#define FS_VERITY_METADATA_TYPE_SIGNATURE 3
struct fsverity_read_metadata_arg {
__u64 metadata_type;
__u64 offset;
__u64 length;
__u64 buf_ptr;
__u64 __reserved;
};
#define FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg)
#define FS_IOC_MEASURE_VERITY _IOWR('f', 134, struct fsverity_digest)
#define FS_IOC_READ_VERITY_METADATA \
_IOWR('f', 135, struct fsverity_read_metadata_arg)
#endif /* _UAPI_LINUX_FSVERITY_H */

View File

@ -179,8 +179,9 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
cond_resched();
first_block = bmap(inode, probe_block);
if (first_block == 0)
first_block = probe_block;
ret = bmap(inode, &first_block);
if (ret || !first_block)
goto bad_bmap;
/*
@ -195,9 +196,11 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
block_in_page++) {
sector_t block;
block = bmap(inode, probe_block + block_in_page);
if (block == 0)
block = probe_block + block_in_page;
ret = bmap(inode, &block);
if (ret || !block)
goto bad_bmap;
if (block != first_block + block_in_page) {
/* Discontiguity */
probe_block++;