diff mbox series

[v3,3/3] scripts: add filev2p.py script for mapping virtual file offsets mapping

Message ID 20240716144123.651476-4-andrey.drobyshev@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series Fix data corruption within preallocation | expand

Commit Message

Andrey Drobyshev July 16, 2024, 2:41 p.m. UTC
The script is basically a wrapper around "filefrag" utility.  This might
be used to map virtual offsets within the file to the underlying block
device offsets.  In addition, a chunk size might be specified, in which
case a list of such mappings will be obtained:

$ scripts/filev2p.py -s 100M /sparsefile 1768M
1853882368..1895825407 (file)  ->  16332619776..16374562815 (/dev/sda4)  ->  84492156928..84534099967 (/dev/sda)
1895825408..1958739967 (file)  ->  17213591552..17276506111 (/dev/sda4)  ->  85373128704..85436043263 (/dev/sda)

This could come in handy when we need to map a certain piece of data
within a file inside VM to the same data within the image on the host
(e.g. physical offset on VM's /dev/sda would be the virtual offset
within QCOW2 image).

Note: as of now the script only works with the files located on plain
partitions, i.e. it doesn't work with partitions built on top of LVM.
Partitions on LVM would require another level of mapping.

Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 scripts/filev2p.py | 311 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 311 insertions(+)
 create mode 100755 scripts/filev2p.py

Comments

Kevin Wolf Aug. 5, 2024, 12:29 p.m. UTC | #1
Am 16.07.2024 um 16:41 hat Andrey Drobyshev geschrieben:
> The script is basically a wrapper around "filefrag" utility.  This might
> be used to map virtual offsets within the file to the underlying block
> device offsets.  In addition, a chunk size might be specified, in which
> case a list of such mappings will be obtained:
> 
> $ scripts/filev2p.py -s 100M /sparsefile 1768M
> 1853882368..1895825407 (file)  ->  16332619776..16374562815 (/dev/sda4)  ->  84492156928..84534099967 (/dev/sda)
> 1895825408..1958739967 (file)  ->  17213591552..17276506111 (/dev/sda4)  ->  85373128704..85436043263 (/dev/sda)
> 
> This could come in handy when we need to map a certain piece of data
> within a file inside VM to the same data within the image on the host
> (e.g. physical offset on VM's /dev/sda would be the virtual offset
> within QCOW2 image).
> 
> Note: as of now the script only works with the files located on plain
> partitions, i.e. it doesn't work with partitions built on top of LVM.
> Partitions on LVM would require another level of mapping.
> 
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>  scripts/filev2p.py | 311 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 311 insertions(+)
>  create mode 100755 scripts/filev2p.py
> 
> diff --git a/scripts/filev2p.py b/scripts/filev2p.py
> new file mode 100755
> index 0000000000..3bd7d18b5e
> --- /dev/null
> +++ b/scripts/filev2p.py
> @@ -0,0 +1,311 @@
> +#!/usr/bin/env python3
> +#
> +# Map file virtual offset to the offset on the underlying block device.
> +# Works by parsing 'filefrag' output.
> +#
> +# Copyright (c) 2024 Virtuozzo International GmbH.
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
> +#
> +
> +import argparse
> +import os
> +import subprocess
> +import re
> +import sys
> +
> +from bisect import bisect_right
> +from collections import namedtuple
> +from dataclasses import dataclass
> +from shutil import which
> +from stat import S_ISBLK
> +
> +
> +Partition = namedtuple('Partition', ['partpath', 'diskpath', 'part_offt'])
> +
> +
> +@dataclass
> +class Extent:
> +    '''Class representing an individual file extent.
> +
> +    This is basically a piece of data within the file which is located
> +    consecutively (i.e. not sparsely) on the underlying block device.
> +    '''

Python docstrings should always be triple double quotes """...""" as per
PEP 257.

Some functions below even use a single single quote because they are on
a single line. They should still use the same convention.

> +
> +    log_start:  int
> +    log_end:    int
> +    phys_start: int
> +    phys_end:   int
> +    length:     int
> +    partition:  Partition
> +
> +    @property
> +    def disk_start(self):
> +        'Number of the first byte of this extent on the whole disk (/dev/sda)'
> +        return self.partition.part_offt + self.phys_start
> +
> +    @property
> +    def disk_end(self):
> +        'Number of the last byte of this extent on the whole disk (/dev/sda)'
> +        return self.partition.part_offt + self.phys_end
> +
> +    def __str__(self):
> +        ischunk = self.log_end > self.log_start
> +        maybe_end = lambda s: f'..{s}' if ischunk else ''
> +        return '%s%s (file)  ->  %s%s (%s)  ->  %s%s (%s)' % (
> +            self.log_start, maybe_end(self.log_end),
> +            self.phys_start, maybe_end(self.phys_end), self.partition.partpath,
> +            self.disk_start, maybe_end(self.disk_end), self.partition.diskpath
> +        )
> +
> +    @classmethod
> +    def ext_slice(cls, bigger_ext, start, end):
> +        '''Constructor for the Extent class from a bigger extent.
> +
> +        Return Extent instance which is a slice of @bigger_ext contained
> +        within the range [start, end].
> +        '''
> +
> +        assert start >= bigger_ext.log_start
> +        assert end <= bigger_ext.log_end
> +
> +        if start == bigger_ext.log_start and end == bigger_ext.log_end:
> +            return bigger_ext
> +
> +        phys_start = bigger_ext.phys_start + (start - bigger_ext.log_start)
> +        phys_end = bigger_ext.phys_end - (bigger_ext.log_end - end)
> +        length = end - start + 1
> +
> +        return cls(start, end, phys_start, phys_end, length,
> +                   bigger_ext.partition)
> +
> +
> +def run_cmd(cmd: str) -> str:
> +    '''Wrapper around subprocess.run.
> +
> +    Returns stdout in case of success, emits en error and exits in case
> +    of failure.
> +    '''
> +
> +    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
> +                          check=False, shell=True)
> +    if proc.stderr is not None:
> +        stderr = f'\n{proc.stderr.decode().strip()}'
> +    else:
> +        stderr = ''
> +
> +    if proc.returncode:
> +        sys.exit(f'Error: Command "{cmd}" returned {proc.returncode}:{stderr}')
> +
> +    return proc.stdout.decode().strip()
> +
> +
> +def parse_size(offset: str) -> int:
> +    'Convert human readable size to bytes'
> +
> +    suffixes = {
> +        **dict.fromkeys(['k', 'K', 'Kb', 'KB', 'KiB'], 2 ** 10),
> +        **dict.fromkeys(['m', 'M', 'Mb', 'MB', 'MiB'], 2 ** 20),
> +        **dict.fromkeys(['g', 'G', 'Gb', 'GB', 'GiB'], 2 ** 30),
> +        **dict.fromkeys(     ['T', 'Tb', 'TB', 'TiB'], 2 ** 40),
> +        **dict.fromkeys([''],                          1)
> +    }
> +
> +    sizematch = re.match(r'^([0-9]+)\s*([a-zA-Z]*)$', offset)
> +    if not bool(sizematch):
> +        sys.exit(f'Error: Couldn\'t parse size "{offset}". Pass offset '
> +                  'either in bytes or in format 1K, 2M, 3G')
> +
> +    num, suff = sizematch.groups()
> +    num = int(num)
> +
> +    mult = suffixes.get(suff)
> +    if mult is None:
> +        sys.exit(f'Error: Couldn\'t parse size "{offset}": '
> +                 f'unknown suffix {suff}')
> +
> +    return num * mult
> +
> +
> +def fpath2part(filename: str) -> str:
> +    'Get partition on which @filename is located (i.e. /dev/sda1).'
> +
> +    partpath = run_cmd(f'df --output=source {filename} | tail -n+2')

Anything passed to a shell (like {filename}) certainly must have proper
quoting applied to avoid shell injections?

> +    if not os.path.exists(partpath) or not S_ISBLK(os.stat(partpath).st_mode):
> +        sys.exit(f'Error: file {filename} is located on {partpath} which '
> +                 'isn\'t a block device')
> +    return partpath
> +
> +
> +def part2dev(partpath: str, filename: str) -> str:
> +    'Get block device on which @partpath is located (i.e. /dev/sda).'
> +    dev = run_cmd(f'lsblk -no PKNAME {partpath}')

Missing quoting here, too.

> +    diskpath = f'/dev/{dev}'
> +    if not os.path.exists(diskpath) or not S_ISBLK(os.stat(diskpath).st_mode):
> +        sys.exit(f'Error: file {filename} is located on {diskpath} which '
> +                 'isn\'t a block device')
> +    return diskpath
> +
> +
> +def part2disktype(partpath: str) -> str:
> +    'Parse /proc/devices and get block device type for @partpath'
> +
> +    major = os.major(os.stat(partpath).st_rdev)
> +    assert major
> +    with open('/proc/devices', encoding='utf-8') as devf:
> +        for line in reversed(list(devf)):
> +            # Our major cannot be absent among block devs
> +            if line.startswith('Block'):
> +                break
> +            devmajor, devtype = line.strip().split()
> +            if int(devmajor) == major:
> +                return devtype
> +
> +    sys.exit('Error: We haven\'t found major {major} in /proc/devices, '
> +             'and that can\'t be')
> +
> +
> +def get_part_offset(part: str, disk: str) -> int:
> +    'Get offset in bytes of the partition @part on the block device @disk.'
> +
> +    lines = run_cmd(f'fdisk -l {disk} | egrep "^(Units|{part})"').splitlines()

And here.

We should probably also match a space after {part} to avoid selecting
other partitions that have {part} as a prefix (like partition 10 when we
want partition 1). I think we would actually always get the wanted one
first, but it would be cleaner to not even have the others in the
output.

> +
> +    unitmatch = re.match('^.* = ([0-9]+) bytes$', lines[0])
> +    if not bool(unitmatch):
> +        sys.exit(f'Error: Couldn\'t parse "fdisk -l" output:\n{lines[0]}')
> +    secsize = int(unitmatch.group(1))
> +
> +    part_offt = int(lines[1].split()[1])
> +    return part_offt * secsize
> +
> +
> +def parse_frag_line(line: str, partition: Partition) -> Extent:
> +    'Construct Extent instance from a "filefrag" output line.'
> +
> +    nums = [int(n) for n in re.findall(r'[0-9]+', line)]
> +
> +    log_start  = nums[1]
> +    log_end    = nums[2]
> +    phys_start = nums[3]
> +    phys_end   = nums[4]
> +    length     = nums[5]
> +
> +    assert log_start < log_end
> +    assert phys_start < phys_end
> +    assert (log_end - log_start + 1) == (phys_end - phys_start + 1) == length
> +
> +    return Extent(log_start, log_end, phys_start, phys_end, length, partition)
> +
> +
> +def preliminary_checks(args: argparse.Namespace) -> None:
> +    'A bunch of checks to emit an error and exit at the earlier stage.'
> +
> +    if which('filefrag') is None:
> +        sys.exit('Error: Program "filefrag" doesn\'t exist')
> +
> +    if not os.path.exists(args.filename):
> +        sys.exit(f'Error: File {args.filename} doesn\'t exist')
> +
> +    args.filesize = os.path.getsize(args.filename)
> +    if args.offset >= args.filesize:
> +        sys.exit(f'Error: Specified offset {args.offset} exceeds '
> +                 f'file size {args.filesize}')
> +    if args.size and (args.offset + args.size > args.filesize):
> +        sys.exit(f'Error: Chunk of size {args.size} at offset '
> +                 f'{args.offset} exceeds file size {args.filesize}')
> +
> +    args.partpath = fpath2part(args.filename)
> +    args.disktype = part2disktype(args.partpath)
> +    if args.disktype not in ('sd', 'virtblk'):
> +        sys.exit(f'Error: Cannot analyze files on {args.disktype} disks')
> +    args.diskpath = part2dev(args.partpath, args.filename)
> +    args.part_offt = get_part_offset(args.partpath, args.diskpath)
> +
> +
> +def get_extent_maps(args: argparse.Namespace) -> list[Extent]:
> +    'Run "filefrag", parse its output and return a list of Extent instances.'
> +
> +    lines = run_cmd(f'filefrag -b1 -v {args.filename}').splitlines()

And the final missing quoting.

> +
> +    ffinfo_re = re.compile('.* is ([0-9]+) .*of ([0-9]+) bytes')
> +    ff_size, ff_block = re.match(ffinfo_re, lines[1]).groups()
> +
> +    # Paranoia checks
> +    if int(ff_size) != args.filesize:
> +        sys.exit('Error: filefrag and os.path.getsize() report different '
> +                 f'sizes: {ff_size} and {args.filesize}')
> +    if int(ff_block) != 1:
> +        sys.exit(f'Error: "filefrag -b1" invoked, but block size is {ff_block}')
> +
> +    partition = Partition(args.partpath, args.diskpath, args.part_offt)
> +
> +    # Fill extents list from the output
> +    extents = []
> +    for line in lines:
> +        if not re.match(r'^\s*[0-9]+:', line):
> +            continue
> +        extents += [parse_frag_line(line, partition)]
> +
> +    chunk_start = args.offset
> +    chunk_end = args.offset + args.size - 1
> +    ext_offsets = [ext.log_start for ext in extents]
> +    start_ind = bisect_right(ext_offsets, chunk_start) - 1
> +    end_ind = bisect_right(ext_offsets, chunk_end) - 1
> +
> +    res_extents = extents[start_ind : end_ind + 1]
> +    for i, ext in enumerate(res_extents):
> +        start = max(chunk_start, ext.log_start)
> +        end = min(chunk_end, ext.log_end)
> +        res_extents[i] = Extent.ext_slice(ext, start, end)
> +
> +    return res_extents
> +
> +
> +def parse_args() -> argparse.Namespace:
> +    'Define program arguments and parse user input.'
> +
> +    parser = argparse.ArgumentParser(description='''
> +Map file offset to physical offset on the block device
> +
> +With --size provided get a list of mappings for the chunk''',
> +    formatter_class=argparse.RawTextHelpFormatter)
> +
> +    parser.add_argument('filename', type=str, help='filename to process')
> +    parser.add_argument('offset', type=str,
> +                        help='logical offset inside the file')
> +    parser.add_argument('-s', '--size', required=False, type=str,
> +                        help='size of the file chunk to get offsets for')
> +    args = parser.parse_args()
> +
> +    args.offset = parse_size(args.offset)
> +    if args.size:
> +        args.size = parse_size(args.size)
> +    else:
> +        # When no chunk size is provided (only offset), it's equivalent to
> +        # chunk size == 1
> +        args.size = 1
> +
> +    return args
> +
> +
> +def main() -> int:
> +    args = parse_args()
> +    preliminary_checks(args)
> +    extents = get_extent_maps(args)
> +    for ext in extents:
> +        print(ext)
> +
> +
> +if __name__ == '__main__':
> +    sys.exit(main())

Kevin
Andrey Drobyshev Aug. 5, 2024, 1:02 p.m. UTC | #2
On 8/5/24 3:29 PM, Kevin Wolf wrote:
> Am 16.07.2024 um 16:41 hat Andrey Drobyshev geschrieben:
>> The script is basically a wrapper around "filefrag" utility.  This might
>> be used to map virtual offsets within the file to the underlying block
>> device offsets.  In addition, a chunk size might be specified, in which
>> case a list of such mappings will be obtained:
>>
>> $ scripts/filev2p.py -s 100M /sparsefile 1768M
>> 1853882368..1895825407 (file)  ->  16332619776..16374562815 (/dev/sda4)  ->  84492156928..84534099967 (/dev/sda)
>> 1895825408..1958739967 (file)  ->  17213591552..17276506111 (/dev/sda4)  ->  85373128704..85436043263 (/dev/sda)
>>
>> This could come in handy when we need to map a certain piece of data
>> within a file inside VM to the same data within the image on the host
>> (e.g. physical offset on VM's /dev/sda would be the virtual offset
>> within QCOW2 image).
>>
>> Note: as of now the script only works with the files located on plain
>> partitions, i.e. it doesn't work with partitions built on top of LVM.
>> Partitions on LVM would require another level of mapping.
>>
>> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
>> ---
>>  scripts/filev2p.py | 311 +++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 311 insertions(+)
>>  create mode 100755 scripts/filev2p.py
>>
>> diff --git a/scripts/filev2p.py b/scripts/filev2p.py
>> new file mode 100755
>> index 0000000000..3bd7d18b5e
>> --- /dev/null
>> +++ b/scripts/filev2p.py
>> @@ -0,0 +1,311 @@
>> +#!/usr/bin/env python3
>> +#
>> +# Map file virtual offset to the offset on the underlying block device.
>> +# Works by parsing 'filefrag' output.
>> +#
>> +# Copyright (c) 2024 Virtuozzo International GmbH.
>> +#
>> +# This program is free software; you can redistribute it and/or modify
>> +# it under the terms of the GNU General Public License as published by
>> +# the Free Software Foundation; either version 2 of the License, or
>> +# (at your option) any later version.
>> +#
>> +# This program is distributed in the hope that it will be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
>> +#
>> +
>> +import argparse
>> +import os
>> +import subprocess
>> +import re
>> +import sys
>> +
>> +from bisect import bisect_right
>> +from collections import namedtuple
>> +from dataclasses import dataclass
>> +from shutil import which
>> +from stat import S_ISBLK
>> +
>> +
>> +Partition = namedtuple('Partition', ['partpath', 'diskpath', 'part_offt'])
>> +
>> +
>> +@dataclass
>> +class Extent:
>> +    '''Class representing an individual file extent.
>> +
>> +    This is basically a piece of data within the file which is located
>> +    consecutively (i.e. not sparsely) on the underlying block device.
>> +    '''
> 
> Python docstrings should always be triple double quotes """...""" as per
> PEP 257.
> 
> Some functions below even use a single single quote because they are on
> a single line. They should still use the same convention.
> 
>> +
>> +    log_start:  int
>> +    log_end:    int
>> +    phys_start: int
>> +    phys_end:   int
>> +    length:     int
>> +    partition:  Partition
>> +
>> +    @property
>> +    def disk_start(self):
>> +        'Number of the first byte of this extent on the whole disk (/dev/sda)'
>> +        return self.partition.part_offt + self.phys_start
>> +
>> +    @property
>> +    def disk_end(self):
>> +        'Number of the last byte of this extent on the whole disk (/dev/sda)'
>> +        return self.partition.part_offt + self.phys_end
>> +
>> +    def __str__(self):
>> +        ischunk = self.log_end > self.log_start
>> +        maybe_end = lambda s: f'..{s}' if ischunk else ''
>> +        return '%s%s (file)  ->  %s%s (%s)  ->  %s%s (%s)' % (
>> +            self.log_start, maybe_end(self.log_end),
>> +            self.phys_start, maybe_end(self.phys_end), self.partition.partpath,
>> +            self.disk_start, maybe_end(self.disk_end), self.partition.diskpath
>> +        )
>> +
>> +    @classmethod
>> +    def ext_slice(cls, bigger_ext, start, end):
>> +        '''Constructor for the Extent class from a bigger extent.
>> +
>> +        Return Extent instance which is a slice of @bigger_ext contained
>> +        within the range [start, end].
>> +        '''
>> +
>> +        assert start >= bigger_ext.log_start
>> +        assert end <= bigger_ext.log_end
>> +
>> +        if start == bigger_ext.log_start and end == bigger_ext.log_end:
>> +            return bigger_ext
>> +
>> +        phys_start = bigger_ext.phys_start + (start - bigger_ext.log_start)
>> +        phys_end = bigger_ext.phys_end - (bigger_ext.log_end - end)
>> +        length = end - start + 1
>> +
>> +        return cls(start, end, phys_start, phys_end, length,
>> +                   bigger_ext.partition)
>> +
>> +
>> +def run_cmd(cmd: str) -> str:
>> +    '''Wrapper around subprocess.run.
>> +
>> +    Returns stdout in case of success, emits en error and exits in case
>> +    of failure.
>> +    '''
>> +
>> +    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
>> +                          check=False, shell=True)
>> +    if proc.stderr is not None:
>> +        stderr = f'\n{proc.stderr.decode().strip()}'
>> +    else:
>> +        stderr = ''
>> +
>> +    if proc.returncode:
>> +        sys.exit(f'Error: Command "{cmd}" returned {proc.returncode}:{stderr}')
>> +
>> +    return proc.stdout.decode().strip()
>> +
>> +
>> +def parse_size(offset: str) -> int:
>> +    'Convert human readable size to bytes'
>> +
>> +    suffixes = {
>> +        **dict.fromkeys(['k', 'K', 'Kb', 'KB', 'KiB'], 2 ** 10),
>> +        **dict.fromkeys(['m', 'M', 'Mb', 'MB', 'MiB'], 2 ** 20),
>> +        **dict.fromkeys(['g', 'G', 'Gb', 'GB', 'GiB'], 2 ** 30),
>> +        **dict.fromkeys(     ['T', 'Tb', 'TB', 'TiB'], 2 ** 40),
>> +        **dict.fromkeys([''],                          1)
>> +    }
>> +
>> +    sizematch = re.match(r'^([0-9]+)\s*([a-zA-Z]*)$', offset)
>> +    if not bool(sizematch):
>> +        sys.exit(f'Error: Couldn\'t parse size "{offset}". Pass offset '
>> +                  'either in bytes or in format 1K, 2M, 3G')
>> +
>> +    num, suff = sizematch.groups()
>> +    num = int(num)
>> +
>> +    mult = suffixes.get(suff)
>> +    if mult is None:
>> +        sys.exit(f'Error: Couldn\'t parse size "{offset}": '
>> +                 f'unknown suffix {suff}')
>> +
>> +    return num * mult
>> +
>> +
>> +def fpath2part(filename: str) -> str:
>> +    'Get partition on which @filename is located (i.e. /dev/sda1).'
>> +
>> +    partpath = run_cmd(f'df --output=source {filename} | tail -n+2')
> 
> Anything passed to a shell (like {filename}) certainly must have proper
> quoting applied to avoid shell injections?
> 
>> +    if not os.path.exists(partpath) or not S_ISBLK(os.stat(partpath).st_mode):
>> +        sys.exit(f'Error: file {filename} is located on {partpath} which '
>> +                 'isn\'t a block device')
>> +    return partpath
>> +
>> +
>> +def part2dev(partpath: str, filename: str) -> str:
>> +    'Get block device on which @partpath is located (i.e. /dev/sda).'
>> +    dev = run_cmd(f'lsblk -no PKNAME {partpath}')
> 
> Missing quoting here, too.
> 
>> +    diskpath = f'/dev/{dev}'
>> +    if not os.path.exists(diskpath) or not S_ISBLK(os.stat(diskpath).st_mode):
>> +        sys.exit(f'Error: file {filename} is located on {diskpath} which '
>> +                 'isn\'t a block device')
>> +    return diskpath
>> +
>> +
>> +def part2disktype(partpath: str) -> str:
>> +    'Parse /proc/devices and get block device type for @partpath'
>> +
>> +    major = os.major(os.stat(partpath).st_rdev)
>> +    assert major
>> +    with open('/proc/devices', encoding='utf-8') as devf:
>> +        for line in reversed(list(devf)):
>> +            # Our major cannot be absent among block devs
>> +            if line.startswith('Block'):
>> +                break
>> +            devmajor, devtype = line.strip().split()
>> +            if int(devmajor) == major:
>> +                return devtype
>> +
>> +    sys.exit('Error: We haven\'t found major {major} in /proc/devices, '
>> +             'and that can\'t be')
>> +
>> +
>> +def get_part_offset(part: str, disk: str) -> int:
>> +    'Get offset in bytes of the partition @part on the block device @disk.'
>> +
>> +    lines = run_cmd(f'fdisk -l {disk} | egrep "^(Units|{part})"').splitlines()
> 
> And here.
> 
> We should probably also match a space after {part} to avoid selecting
> other partitions that have {part} as a prefix (like partition 10 when we
> want partition 1). I think we would actually always get the wanted one
> first, but it would be cleaner to not even have the others in the
> output.
> 
>> +
>> +    unitmatch = re.match('^.* = ([0-9]+) bytes$', lines[0])
>> +    if not bool(unitmatch):
>> +        sys.exit(f'Error: Couldn\'t parse "fdisk -l" output:\n{lines[0]}')
>> +    secsize = int(unitmatch.group(1))
>> +
>> +    part_offt = int(lines[1].split()[1])
>> +    return part_offt * secsize
>> +
>> +
>> +def parse_frag_line(line: str, partition: Partition) -> Extent:
>> +    'Construct Extent instance from a "filefrag" output line.'
>> +
>> +    nums = [int(n) for n in re.findall(r'[0-9]+', line)]
>> +
>> +    log_start  = nums[1]
>> +    log_end    = nums[2]
>> +    phys_start = nums[3]
>> +    phys_end   = nums[4]
>> +    length     = nums[5]
>> +
>> +    assert log_start < log_end
>> +    assert phys_start < phys_end
>> +    assert (log_end - log_start + 1) == (phys_end - phys_start + 1) == length
>> +
>> +    return Extent(log_start, log_end, phys_start, phys_end, length, partition)
>> +
>> +
>> +def preliminary_checks(args: argparse.Namespace) -> None:
>> +    'A bunch of checks to emit an error and exit at the earlier stage.'
>> +
>> +    if which('filefrag') is None:
>> +        sys.exit('Error: Program "filefrag" doesn\'t exist')
>> +
>> +    if not os.path.exists(args.filename):
>> +        sys.exit(f'Error: File {args.filename} doesn\'t exist')
>> +
>> +    args.filesize = os.path.getsize(args.filename)
>> +    if args.offset >= args.filesize:
>> +        sys.exit(f'Error: Specified offset {args.offset} exceeds '
>> +                 f'file size {args.filesize}')
>> +    if args.size and (args.offset + args.size > args.filesize):
>> +        sys.exit(f'Error: Chunk of size {args.size} at offset '
>> +                 f'{args.offset} exceeds file size {args.filesize}')
>> +
>> +    args.partpath = fpath2part(args.filename)
>> +    args.disktype = part2disktype(args.partpath)
>> +    if args.disktype not in ('sd', 'virtblk'):
>> +        sys.exit(f'Error: Cannot analyze files on {args.disktype} disks')
>> +    args.diskpath = part2dev(args.partpath, args.filename)
>> +    args.part_offt = get_part_offset(args.partpath, args.diskpath)
>> +
>> +
>> +def get_extent_maps(args: argparse.Namespace) -> list[Extent]:
>> +    'Run "filefrag", parse its output and return a list of Extent instances.'
>> +
>> +    lines = run_cmd(f'filefrag -b1 -v {args.filename}').splitlines()
> 
> And the final missing quoting.
> 
>> +
>> +    ffinfo_re = re.compile('.* is ([0-9]+) .*of ([0-9]+) bytes')
>> +    ff_size, ff_block = re.match(ffinfo_re, lines[1]).groups()
>> +
>> +    # Paranoia checks
>> +    if int(ff_size) != args.filesize:
>> +        sys.exit('Error: filefrag and os.path.getsize() report different '
>> +                 f'sizes: {ff_size} and {args.filesize}')
>> +    if int(ff_block) != 1:
>> +        sys.exit(f'Error: "filefrag -b1" invoked, but block size is {ff_block}')
>> +
>> +    partition = Partition(args.partpath, args.diskpath, args.part_offt)
>> +
>> +    # Fill extents list from the output
>> +    extents = []
>> +    for line in lines:
>> +        if not re.match(r'^\s*[0-9]+:', line):
>> +            continue
>> +        extents += [parse_frag_line(line, partition)]
>> +
>> +    chunk_start = args.offset
>> +    chunk_end = args.offset + args.size - 1
>> +    ext_offsets = [ext.log_start for ext in extents]
>> +    start_ind = bisect_right(ext_offsets, chunk_start) - 1
>> +    end_ind = bisect_right(ext_offsets, chunk_end) - 1
>> +
>> +    res_extents = extents[start_ind : end_ind + 1]
>> +    for i, ext in enumerate(res_extents):
>> +        start = max(chunk_start, ext.log_start)
>> +        end = min(chunk_end, ext.log_end)
>> +        res_extents[i] = Extent.ext_slice(ext, start, end)
>> +
>> +    return res_extents
>> +
>> +
>> +def parse_args() -> argparse.Namespace:
>> +    'Define program arguments and parse user input.'
>> +
>> +    parser = argparse.ArgumentParser(description='''
>> +Map file offset to physical offset on the block device
>> +
>> +With --size provided get a list of mappings for the chunk''',
>> +    formatter_class=argparse.RawTextHelpFormatter)
>> +
>> +    parser.add_argument('filename', type=str, help='filename to process')
>> +    parser.add_argument('offset', type=str,
>> +                        help='logical offset inside the file')
>> +    parser.add_argument('-s', '--size', required=False, type=str,
>> +                        help='size of the file chunk to get offsets for')
>> +    args = parser.parse_args()
>> +
>> +    args.offset = parse_size(args.offset)
>> +    if args.size:
>> +        args.size = parse_size(args.size)
>> +    else:
>> +        # When no chunk size is provided (only offset), it's equivalent to
>> +        # chunk size == 1
>> +        args.size = 1
>> +
>> +    return args
>> +
>> +
>> +def main() -> int:
>> +    args = parse_args()
>> +    preliminary_checks(args)
>> +    extents = get_extent_maps(args)
>> +    for ext in extents:
>> +        print(ext)
>> +
>> +
>> +if __name__ == '__main__':
>> +    sys.exit(main())
> 
> Kevin
> 

Agreed on all the above comments, thank you.
diff mbox series

Patch

diff --git a/scripts/filev2p.py b/scripts/filev2p.py
new file mode 100755
index 0000000000..3bd7d18b5e
--- /dev/null
+++ b/scripts/filev2p.py
@@ -0,0 +1,311 @@ 
+#!/usr/bin/env python3
+#
+# Map file virtual offset to the offset on the underlying block device.
+# Works by parsing 'filefrag' output.
+#
+# Copyright (c) 2024 Virtuozzo International GmbH.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import argparse
+import os
+import subprocess
+import re
+import sys
+
+from bisect import bisect_right
+from collections import namedtuple
+from dataclasses import dataclass
+from shutil import which
+from stat import S_ISBLK
+
+
+Partition = namedtuple('Partition', ['partpath', 'diskpath', 'part_offt'])
+
+
+@dataclass
+class Extent:
+    '''Class representing an individual file extent.
+
+    This is basically a piece of data within the file which is located
+    consecutively (i.e. not sparsely) on the underlying block device.
+    '''
+
+    log_start:  int
+    log_end:    int
+    phys_start: int
+    phys_end:   int
+    length:     int
+    partition:  Partition
+
+    @property
+    def disk_start(self):
+        'Number of the first byte of this extent on the whole disk (/dev/sda)'
+        return self.partition.part_offt + self.phys_start
+
+    @property
+    def disk_end(self):
+        'Number of the last byte of this extent on the whole disk (/dev/sda)'
+        return self.partition.part_offt + self.phys_end
+
+    def __str__(self):
+        ischunk = self.log_end > self.log_start
+        maybe_end = lambda s: f'..{s}' if ischunk else ''
+        return '%s%s (file)  ->  %s%s (%s)  ->  %s%s (%s)' % (
+            self.log_start, maybe_end(self.log_end),
+            self.phys_start, maybe_end(self.phys_end), self.partition.partpath,
+            self.disk_start, maybe_end(self.disk_end), self.partition.diskpath
+        )
+
+    @classmethod
+    def ext_slice(cls, bigger_ext, start, end):
+        '''Constructor for the Extent class from a bigger extent.
+
+        Return Extent instance which is a slice of @bigger_ext contained
+        within the range [start, end].
+        '''
+
+        assert start >= bigger_ext.log_start
+        assert end <= bigger_ext.log_end
+
+        if start == bigger_ext.log_start and end == bigger_ext.log_end:
+            return bigger_ext
+
+        phys_start = bigger_ext.phys_start + (start - bigger_ext.log_start)
+        phys_end = bigger_ext.phys_end - (bigger_ext.log_end - end)
+        length = end - start + 1
+
+        return cls(start, end, phys_start, phys_end, length,
+                   bigger_ext.partition)
+
+
+def run_cmd(cmd: str) -> str:
+    '''Wrapper around subprocess.run.
+
+    Returns stdout in case of success, emits en error and exits in case
+    of failure.
+    '''
+
+    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                          check=False, shell=True)
+    if proc.stderr is not None:
+        stderr = f'\n{proc.stderr.decode().strip()}'
+    else:
+        stderr = ''
+
+    if proc.returncode:
+        sys.exit(f'Error: Command "{cmd}" returned {proc.returncode}:{stderr}')
+
+    return proc.stdout.decode().strip()
+
+
+def parse_size(offset: str) -> int:
+    'Convert human readable size to bytes'
+
+    suffixes = {
+        **dict.fromkeys(['k', 'K', 'Kb', 'KB', 'KiB'], 2 ** 10),
+        **dict.fromkeys(['m', 'M', 'Mb', 'MB', 'MiB'], 2 ** 20),
+        **dict.fromkeys(['g', 'G', 'Gb', 'GB', 'GiB'], 2 ** 30),
+        **dict.fromkeys(     ['T', 'Tb', 'TB', 'TiB'], 2 ** 40),
+        **dict.fromkeys([''],                          1)
+    }
+
+    sizematch = re.match(r'^([0-9]+)\s*([a-zA-Z]*)$', offset)
+    if not bool(sizematch):
+        sys.exit(f'Error: Couldn\'t parse size "{offset}". Pass offset '
+                  'either in bytes or in format 1K, 2M, 3G')
+
+    num, suff = sizematch.groups()
+    num = int(num)
+
+    mult = suffixes.get(suff)
+    if mult is None:
+        sys.exit(f'Error: Couldn\'t parse size "{offset}": '
+                 f'unknown suffix {suff}')
+
+    return num * mult
+
+
+def fpath2part(filename: str) -> str:
+    'Get partition on which @filename is located (i.e. /dev/sda1).'
+
+    partpath = run_cmd(f'df --output=source {filename} | tail -n+2')
+    if not os.path.exists(partpath) or not S_ISBLK(os.stat(partpath).st_mode):
+        sys.exit(f'Error: file {filename} is located on {partpath} which '
+                 'isn\'t a block device')
+    return partpath
+
+
+def part2dev(partpath: str, filename: str) -> str:
+    'Get block device on which @partpath is located (i.e. /dev/sda).'
+    dev = run_cmd(f'lsblk -no PKNAME {partpath}')
+    diskpath = f'/dev/{dev}'
+    if not os.path.exists(diskpath) or not S_ISBLK(os.stat(diskpath).st_mode):
+        sys.exit(f'Error: file {filename} is located on {diskpath} which '
+                 'isn\'t a block device')
+    return diskpath
+
+
+def part2disktype(partpath: str) -> str:
+    'Parse /proc/devices and get block device type for @partpath'
+
+    major = os.major(os.stat(partpath).st_rdev)
+    assert major
+    with open('/proc/devices', encoding='utf-8') as devf:
+        for line in reversed(list(devf)):
+            # Our major cannot be absent among block devs
+            if line.startswith('Block'):
+                break
+            devmajor, devtype = line.strip().split()
+            if int(devmajor) == major:
+                return devtype
+
+    sys.exit('Error: We haven\'t found major {major} in /proc/devices, '
+             'and that can\'t be')
+
+
+def get_part_offset(part: str, disk: str) -> int:
+    'Get offset in bytes of the partition @part on the block device @disk.'
+
+    lines = run_cmd(f'fdisk -l {disk} | egrep "^(Units|{part})"').splitlines()
+
+    unitmatch = re.match('^.* = ([0-9]+) bytes$', lines[0])
+    if not bool(unitmatch):
+        sys.exit(f'Error: Couldn\'t parse "fdisk -l" output:\n{lines[0]}')
+    secsize = int(unitmatch.group(1))
+
+    part_offt = int(lines[1].split()[1])
+    return part_offt * secsize
+
+
+def parse_frag_line(line: str, partition: Partition) -> Extent:
+    'Construct Extent instance from a "filefrag" output line.'
+
+    nums = [int(n) for n in re.findall(r'[0-9]+', line)]
+
+    log_start  = nums[1]
+    log_end    = nums[2]
+    phys_start = nums[3]
+    phys_end   = nums[4]
+    length     = nums[5]
+
+    assert log_start < log_end
+    assert phys_start < phys_end
+    assert (log_end - log_start + 1) == (phys_end - phys_start + 1) == length
+
+    return Extent(log_start, log_end, phys_start, phys_end, length, partition)
+
+
+def preliminary_checks(args: argparse.Namespace) -> None:
+    'A bunch of checks to emit an error and exit at the earlier stage.'
+
+    if which('filefrag') is None:
+        sys.exit('Error: Program "filefrag" doesn\'t exist')
+
+    if not os.path.exists(args.filename):
+        sys.exit(f'Error: File {args.filename} doesn\'t exist')
+
+    args.filesize = os.path.getsize(args.filename)
+    if args.offset >= args.filesize:
+        sys.exit(f'Error: Specified offset {args.offset} exceeds '
+                 f'file size {args.filesize}')
+    if args.size and (args.offset + args.size > args.filesize):
+        sys.exit(f'Error: Chunk of size {args.size} at offset '
+                 f'{args.offset} exceeds file size {args.filesize}')
+
+    args.partpath = fpath2part(args.filename)
+    args.disktype = part2disktype(args.partpath)
+    if args.disktype not in ('sd', 'virtblk'):
+        sys.exit(f'Error: Cannot analyze files on {args.disktype} disks')
+    args.diskpath = part2dev(args.partpath, args.filename)
+    args.part_offt = get_part_offset(args.partpath, args.diskpath)
+
+
+def get_extent_maps(args: argparse.Namespace) -> list[Extent]:
+    'Run "filefrag", parse its output and return a list of Extent instances.'
+
+    lines = run_cmd(f'filefrag -b1 -v {args.filename}').splitlines()
+
+    ffinfo_re = re.compile('.* is ([0-9]+) .*of ([0-9]+) bytes')
+    ff_size, ff_block = re.match(ffinfo_re, lines[1]).groups()
+
+    # Paranoia checks
+    if int(ff_size) != args.filesize:
+        sys.exit('Error: filefrag and os.path.getsize() report different '
+                 f'sizes: {ff_size} and {args.filesize}')
+    if int(ff_block) != 1:
+        sys.exit(f'Error: "filefrag -b1" invoked, but block size is {ff_block}')
+
+    partition = Partition(args.partpath, args.diskpath, args.part_offt)
+
+    # Fill extents list from the output
+    extents = []
+    for line in lines:
+        if not re.match(r'^\s*[0-9]+:', line):
+            continue
+        extents += [parse_frag_line(line, partition)]
+
+    chunk_start = args.offset
+    chunk_end = args.offset + args.size - 1
+    ext_offsets = [ext.log_start for ext in extents]
+    start_ind = bisect_right(ext_offsets, chunk_start) - 1
+    end_ind = bisect_right(ext_offsets, chunk_end) - 1
+
+    res_extents = extents[start_ind : end_ind + 1]
+    for i, ext in enumerate(res_extents):
+        start = max(chunk_start, ext.log_start)
+        end = min(chunk_end, ext.log_end)
+        res_extents[i] = Extent.ext_slice(ext, start, end)
+
+    return res_extents
+
+
+def parse_args() -> argparse.Namespace:
+    'Define program arguments and parse user input.'
+
+    parser = argparse.ArgumentParser(description='''
+Map file offset to physical offset on the block device
+
+With --size provided get a list of mappings for the chunk''',
+    formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument('filename', type=str, help='filename to process')
+    parser.add_argument('offset', type=str,
+                        help='logical offset inside the file')
+    parser.add_argument('-s', '--size', required=False, type=str,
+                        help='size of the file chunk to get offsets for')
+    args = parser.parse_args()
+
+    args.offset = parse_size(args.offset)
+    if args.size:
+        args.size = parse_size(args.size)
+    else:
+        # When no chunk size is provided (only offset), it's equivalent to
+        # chunk size == 1
+        args.size = 1
+
+    return args
+
+
+def main() -> int:
+    args = parse_args()
+    preliminary_checks(args)
+    extents = get_extent_maps(args)
+    for ext in extents:
+        print(ext)
+
+
+if __name__ == '__main__':
+    sys.exit(main())