-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_mgf.py
87 lines (68 loc) · 2.8 KB
/
read_mgf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import re
from typing import List, Dict, Union
import numpy as np
import pandas as pd
def read_mgf(file: List[str]) -> List[Dict[str, Union[Dict[str, float], np.ndarray]]]:
    """
    Read and Process MGF Files
    This function reads MGF files and collects the MS2 spectra they contain.
    Spectra from all input files are returned in one flat list, in file order.
    No peak de-duplication or noise filtering is performed here.
    Parameters:
    - file (list of str): List of file paths to MGF files. A single path given
      as a plain string is also accepted and treated as a one-element list.
    Returns:
    - list of dict: One element per spectrum that has at least one peak line,
      each element containing:
    - 'info': A dictionary with 'mz' and 'rt' for the precursor ion. 'mz' is
      read from a PEPMASS/PRECURSORMZ line and 'rt' from a
      RTINSECONDS/RETENTIONTIME/RTINMINUTES line; the value is stored as found
      (no unit conversion is applied).
    - 'spec': A numpy array with each row representing a fragment ion peak (m/z, intensity).
    Raises:
    - ValueError: If one of the first two fields of a peak line is not a number.
    """
    # A bare string would otherwise be iterated character by character and each
    # character handed to open() as if it were a path.
    paths = [file] if isinstance(file, str) else file

    ms2_data = []
    for mgf_file in paths:
        for entry in ListMGF(mgf_file):
            # Peak lines are the ones starting with a digit. Only the first two
            # whitespace-separated fields (m/z, intensity) are kept: an optional
            # third field (e.g. a fragment charge such as "2+") is not always
            # numeric and is not present on every line, so including it breaks
            # float() or yields rows of unequal length.
            peaks = [
                [float(token) for token in line.split()[:2]]
                for line in entry
                if re.match(r"^\d", line)
            ]
            # Spectra without any fragment data are dropped.
            if not peaks:
                continue
            mz = extract_value(entry, pattern="^(PEPMASS|PRECURSORMZ)")
            rt = extract_value(entry, pattern="^(RTINSECONDS|RETENTIONTIME|RTINMINUTES)")
            ms2_data.append({'info': {'mz': mz, 'rt': rt}, 'spec': np.array(peaks)})
    return ms2_data
def ListMGF(file_path: str) -> List[List[str]]:
    """
    Split an MGF file into its spectrum entries.
    Every line is stripped of surrounding whitespace. Lines are accumulated
    until a line equal to "END IONS" is met; that terminator closes the current
    entry (and is not stored in it). Lines after the last "END IONS" are not
    returned.
    Parameters:
    - file_path (str): Path to an MGF file.
    Returns:
    - list of lists: Each sublist holds the stripped lines of one spectrum entry.
    """
    blocks: List[List[str]] = []
    pending: List[str] = []
    with open(file_path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if text != "END IONS":
                pending.append(text)
                continue
            # Terminator reached: store the finished entry and start a new one.
            blocks.append(pending)
            pending = []
    return blocks
def extract_value(lines: List[str], pattern: str) -> float:
    """
    Extracts a numeric value from a list of strings based on a regex pattern.
    The first line matching `pattern` that contains a number wins. The number is
    the first numeric token found after the first '=' on that line (or anywhere
    on the line when it has no '='). Any text following that token, such as a
    second number or a unit, is ignored.
    Parameters:
    - lines (list of str): List of strings to search.
    - pattern (str): Regex pattern to identify the line containing the value.
    Returns:
    - float: The extracted numeric value, or 0.0 when no matching line holds a number.
    """
    # Optional sign, digits with optional fraction (or a bare fraction), and an
    # optional exponent, so values such as "1.5e3" are read as a whole.
    number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"
    for line in lines:
        if not re.search(pattern, line):
            continue
        _, separator, tail = line.partition('=')
        payload = tail if separator else line
        # Take only the first number: stripping all non-numeric characters
        # instead would glue separate numbers (e.g. "445.12 1000") together.
        found = re.search(number, payload)
        if found:
            return float(found.group())
    return 0.0