Pipeline through notebooks
Preprocess output directory
[1]:
import sys
print(sys.version)
3.7.10 (default, Feb 26 2021, 18:47:35)
[GCC 7.3.0]
Import the needed packages
[14]:
from xml.etree import ElementTree as et
from dataclasses import dataclass
from pathlib import Path
from openpyxl import load_workbook
import pandas as pd
import sys
import os
import csv
from typing import Dict, List, Tuple
sys.path.append('/home/hsarkar/code/slideseq-tools/src/')
from slideseq.metadata import Manifest
from slideseq.metadata import Manifest, split_sample_lanes, validate_run_df
from slideseq.config import get_config
import slideseq.util.constants as constants
from slideseq.util import run_command
import logging
log = logging.getLogger(__name__)
[4]:
import matplotlib.pyplot as plt
%matplotlib inline
Classes
We followed the same structure created for the original slide-seqtools repository. The classes and the containing datastructure are repeatedly used for parsing the excel file containing information about the experiment. The main goal of the associated dataframe is to iterate over all the directories within different libraries to create inermediate files within the output directory.
[4]:
def get_flowcell(run_info: et.ElementTree) -> str:
"""
Get the flowcell name from RunInfo.xml. This should be more reliable than
trying to parse the run directory, which might have been renamed.
:param run_info: ElementTree representing RunInfo.xml
:return: The flowcell name for this run
"""
flowcell = run_info.find("./Run/Flowcell").text
return flowcell
def get_read_structure(run_info: et.ElementTree) -> str:
"""
Get read structure from RunInfo.xml. Assumes one index sequence only,
will warn if two are present and ignore the second.
:param run_info: ElementTree representing RunInfo.xml
:return: Formatting string representing the read structure
"""
read_elems = run_info.findall("./Run/Reads/Read[@NumCycles][@Number]")
read_elems.sort(key=lambda el: int(el.get("Number")))
if len(read_elems) == 4:
# two index reads. We will just ignore the second index
log.warning(
"This sequencing run has two index reads, we are ignoring the second one"
)
return "{}T{}B{}S{}T".format(*(el.get("NumCycles") for el in read_elems))
elif len(read_elems) != 3:
raise ValueError(f"Expected three reads, got {len(read_elems)}")
return "{}T{}B{}T".format(*(el.get("NumCycles") for el in read_elems))
def get_lanes(run_info: et.ElementTree) -> range:
"""
Return the lanes for this run, as a range
:param run_info: ElementTree representing RunInfo.xml
:return: range object for the lanes of the run
"""
lane_count = int(run_info.find("./Run/FlowcellLayout[@LaneCount]").get("LaneCount"))
return range(1, lane_count + 1)
@dataclass
class RunInfo:
"""A dataclass to represent RunInfo.xml for a sequencing run (a.k.a. flowcell)"""
run_dir: Path
flowcell: str
lanes: range
read_structure: str
@property
def demux_log(self):
return f"demultiplex.{self.flowcell}.L00$TASK_ID.log"
@property
def basecall_dir(self):
return self.run_dir / "Data" / "Intensities" / "BaseCalls"
def alignment_log(self, lane: int):
return f"alignment.{self.flowcell}.L{lane:03d}.$TASK_ID.log"
def get_run_info(run_dir: Path) -> RunInfo:
# open the RunInfo.xml file and parse it with element tree
with (run_dir / "RunInfo.xml").open() as f:
run_info = et.parse(f)
return RunInfo(
run_dir,
get_flowcell(run_info),
get_lanes(run_info),
get_read_structure(run_info),
)
[5]:
run_info = get_run_info(
Path('/home/hsarkar/code/slide-seq-data/P25255/220329_A00689_0513_BHYK23DRXY')
)
Run infomation shows that there is one flow-cell and the files are present in the run_dir
. A library contains two lanes.
[6]:
run_info
[6]:
RunInfo(run_dir=PosixPath('/home/hsarkar/code/slide-seq-data/P25255/220329_A00689_0513_BHYK23DRXY'), flowcell='HYK23DRXY', lanes=range(1, 3), read_structure='42T8B41T')
We now construct the required sheet for running further steps in slide-seq pipeline
[7]:
ws = load_workbook('/home/hsarkar/notebooks/slide_seq_analysis/2023/example_metadata.xlsx').active
from itertools import islice
data = ws.values
cols = next(data)
data = list(data)
#idx = [r[0] for r in data]
#data = (islice(r, 1, None) for r in data)
df = pd.DataFrame(data, columns=cols)
run_df = df[constants.METADATA_COLS]
run_df.columns = [c.lower() for c in constants.METADATA_COLS]
import numpy as np
run_df = run_df.fillna(value=0)
run_df = run_df.astype(constants.METADATA_TYPES)
[8]:
run_df
[8]:
library | date | flowcell | run_name | bclpath | lane | sample_barcode | bead_structure | reference | run_barcodematching | locus_function_list | start_sequence | base_quality | min_transcripts_per_cell | puckcaller_path | bead_type | gen_read1_plot | gen_downsampling | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | P25255_1001 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 1 | TAAGGCGA | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
1 | P25255_1002 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 1 | CGTACTAG | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
2 | P25255_1003 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 1 | AGGCAGAA | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
3 | P25255_1004 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 1 | TCCTGAGC | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
4 | P25255_1001 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 2 | TAAGGCGA | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
5 | P25255_1002 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 2 | CGTACTAG | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
6 | P25255_1003 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 2 | AGGCAGAA | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
7 | P25255_1004 | 2022-28-03 | Kharchenko | 1 | /home/hsarkar/code/slide-seq-data/P25255/22032... | 2 | TCCTGAGC | 8C18X6C1X8M1X | /home/hsarkar/code/slide-seq-data/slide-seq-re... | False | 0 | 0 | 0 | 0 | hiraksarkar.cs@gmail.com | /home/hsarkar/code/slide-seq-data/P25255/Barco... | 0 | False | False |
The above data-frame are manually curated. One needs to check with the actual file-structure before creating such a worksheet. Traditionally, the researchers at Broad work with google sheets and fetch data with the sheet id.
Now we would describe the generation process step by step.
Step 1 (Preparing for demultiplexing)
Create folders and files for carrying out demultilexing
[9]:
from slideseq.metadata import Manifest
from typing import Dict, List, Tuple
import logging
log = logging.getLogger(__name__)
import csv
def gen_barcode_file(manifest: Manifest, flowcell: str, lane: int, output_file: Path):
with output_file.open("w") as out:
wtr = csv.writer(out, delimiter="\t")
wtr.writerow(("barcode_sequence_1", "library_name", "barcode_name"))
for library in manifest.libraries:
if (flowcell, lane) in library.samples:
for barcode in library.samples[flowcell, lane]:
# we don't write out barcode_name but the column is required
wtr.writerow((barcode, library.name, ""))
def gen_library_params(manifest: Manifest, flowcell: str, lane: int, output_file: Path):
with output_file.open("w") as out:
wtr = csv.writer(out, delimiter="\t")
wtr.writerow(("OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME", "BARCODE_1"))
for sample in manifest.samples:
if sample.flowcell == flowcell and sample.lane == lane:
# output the uBAM directly to library directory
sample.lane_dir.mkdir(exist_ok=True, parents=True)
for barcode, output_ubam in zip(sample.barcodes, sample.barcode_ubams):
wtr.writerow((output_ubam, sample.name, sample.name, barcode))
def prepare_demux(run_info_list: List[RunInfo], manifest: Manifest):
"""create a bunch of directories, and write some input files for picard"""
# Create directories
log.info(
f"Creating directories in {manifest.workflow_dir} and {manifest.library_dir}"
)
for run_info in run_info_list:
for lane in run_info.lanes:
output_lane_dir = manifest.workflow_dir / run_info.flowcell / f"L{lane:03d}"
output_lane_dir.mkdir(exist_ok=True, parents=True)
(output_lane_dir / "barcodes").mkdir(exist_ok=True)
# Generate barcode_params.txt that is needed by ExtractIlluminaBarcodes
gen_barcode_file(
manifest,
run_info.flowcell,
lane,
output_lane_dir / "barcode_params.txt",
)
# Generate library_params that is needed by IlluminaBasecallsToSam
gen_library_params(
manifest,
run_info.flowcell,
lane,
output_lane_dir / "library_params.txt",
)
def validate_demux(manifest: Manifest):
"""verify that `prepare_demux` was run previously"""
if not manifest.workflow_dir.exists():
log.error(f"{manifest.workflow_dir} does not exist")
return False
for flowcell_dir in manifest.flowcell_dirs:
run_info = get_run_info(flowcell_dir)
# Create directories
log.info(f"Checking directories in {manifest.workflow_dir / run_info.flowcell}")
for lane in run_info.lanes:
output_lane_dir = manifest.workflow_dir / run_info.flowcell / f"L{lane:03d}"
for p in (
output_lane_dir,
output_lane_dir / "barcodes",
output_lane_dir / "barcode_params.txt",
output_lane_dir / "library_params.txt",
):
if not p.exists():
log.error(f"{p} does not exist, demux looks incomplete")
return False
return True
def validate_alignment(manifest: Manifest, n_libraries: int):
"""verify that alignment was run and output is present"""
for i in range(n_libraries):
library = manifest.get_library(i)
for p_list in (
library.polya_filtering_summaries,
library.star_logs,
library.alignment_pickles,
library.processed_bams,
):
for p in p_list:
if not p.exists():
log.error(f"{p} does not exist, alignment looks incomplete")
return False
else:
return True
[10]:
run_name = '1'
output_dir = Path('/home/hsarkar/code/slide-seq-data/P25255/processed_data')/\
Path(run_name)
flowcell_dirs = sorted(Path(fd) for fd in set(run_df.bclpath))
manifest_file = output_dir / "manifest.yaml"
metadata_file = output_dir / "metadata.csv"
run_info_list = [ run_info ]
config = get_config()
manifest = Manifest(
run_name=run_name,
flowcell_dirs=flowcell_dirs,
workflow_dir=output_dir,
library_dir=Path('/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries'),
metadata_file=metadata_file,
metadata=split_sample_lanes(run_df, run_info_list),
email_addresses=sorted(
set(e.strip() for v in run_df.email for e in v.split(","))
),
)
At this point we have read the config file stored within the slideseq-tools
directory, curtailed to our need
[11]:
config
[11]:
Config(picard=PosixPath('/home/hsarkar/code/slideseq-tools/soft/picard/build/libs/picard.jar'), dropseq_dir=PosixPath('/home/hsarkar/code/slideseq-tools/soft/Drop-seq_tools-2.5.1'), reference_dir=PosixPath('/broad/macosko/reference'), workflow_dir=PosixPath('/broad/macosko/data/workflows/flowcell'), library_dir=PosixPath('/broad/macosko/data/libraries'), gsecret_name='projects/velina-208320/secrets/sequencing-credentials/versions/latest', gsheet_id='1kwnKrkbl80LyE9lND0UZZJXipL4yfBbGjkTe6hcwJic', worksheet='Experiment Log', gs_path=PosixPath('macosko_data/libraries'))
[12]:
manifest
[12]:
Manifest(run_name='1', flowcell_dirs=[PosixPath('/home/hsarkar/code/slide-seq-data/P25255/220329_A00689_0513_BHYK23DRXY')], workflow_dir=PosixPath('/home/hsarkar/code/slide-seq-data/P25255/processed_data/1'), library_dir=PosixPath('/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries'), metadata_file=PosixPath('/home/hsarkar/code/slide-seq-data/P25255/processed_data/1/metadata.csv'), metadata= library date flowcell run_name \
0 P25255_1001 2022-28-03 HYK23DRXY 1
1 P25255_1002 2022-28-03 HYK23DRXY 1
2 P25255_1003 2022-28-03 HYK23DRXY 1
3 P25255_1004 2022-28-03 HYK23DRXY 1
4 P25255_1001 2022-28-03 HYK23DRXY 1
5 P25255_1002 2022-28-03 HYK23DRXY 1
6 P25255_1003 2022-28-03 HYK23DRXY 1
7 P25255_1004 2022-28-03 HYK23DRXY 1
bclpath lane sample_barcode \
0 /home/hsarkar/code/slide-seq-data/P25255/22032... 1 TAAGGCGA
1 /home/hsarkar/code/slide-seq-data/P25255/22032... 1 CGTACTAG
2 /home/hsarkar/code/slide-seq-data/P25255/22032... 1 AGGCAGAA
3 /home/hsarkar/code/slide-seq-data/P25255/22032... 1 TCCTGAGC
4 /home/hsarkar/code/slide-seq-data/P25255/22032... 2 TAAGGCGA
5 /home/hsarkar/code/slide-seq-data/P25255/22032... 2 CGTACTAG
6 /home/hsarkar/code/slide-seq-data/P25255/22032... 2 AGGCAGAA
7 /home/hsarkar/code/slide-seq-data/P25255/22032... 2 TCCTGAGC
bead_structure reference \
0 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
1 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
2 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
3 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
4 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
5 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
6 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
7 8C18X6C1X8M1X /home/hsarkar/code/slide-seq-data/slide-seq-re...
run_barcodematching locus_function_list start_sequence base_quality \
0 False 0 0 0
1 False 0 0 0
2 False 0 0 0
3 False 0 0 0
4 False 0 0 0
5 False 0 0 0
6 False 0 0 0
7 False 0 0 0
min_transcripts_per_cell email \
0 0 hiraksarkar.cs@gmail.com
1 0 hiraksarkar.cs@gmail.com
2 0 hiraksarkar.cs@gmail.com
3 0 hiraksarkar.cs@gmail.com
4 0 hiraksarkar.cs@gmail.com
5 0 hiraksarkar.cs@gmail.com
6 0 hiraksarkar.cs@gmail.com
7 0 hiraksarkar.cs@gmail.com
puckcaller_path bead_type \
0 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
1 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
2 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
3 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
4 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
5 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
6 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
7 /home/hsarkar/code/slide-seq-data/P25255/Barco... 0
gen_read1_plot gen_downsampling
0 False False
1 False False
2 False False
3 False False
4 False False
5 False False
6 False False
7 False False , email_addresses=['hiraksarkar.cs@gmail.com'])
Create folders using the manifest files
[ ]:
prepare_demux(run_info_list, manifest)
Step 2 Demultiplex (Has to be run in command line)
demultiplex.sh
should contain the locations that can be determined from the worksheet and the manifest file
TMP_DIR="/home/hsarkar/code/slide-seq-data/P25255/processed_data/tmp"
PICARD_JAR="/home/hsarkar/code/slideseq-tools/soft/picard/build/libs/picard.jar"
SGE_TASK_ID="2"
BASECALLS_DIR="/home/hsarkar/code/slide-seq-data/P25255/220329_A00689_0513_BHYK23DRXY/Data/Intensities/BaseCalls"
READ_STRUCTURE="42T8B41T"
OUTPUT_DIR="/home/hsarkar/code/slide-seq-data/P25255/processed_data/1"
FLOWCELL="HYK23DRXY"
Run the following script
./scripts/demultiplex.sh
We can see the created files here
[13]:
n_libraries = len(list(manifest.libraries))
for library_index in range(n_libraries):
for run_info in run_info_list:
for lane in run_info.lanes:
print('[{},{},{}]'.format(
library_index,run_info.flowcell, lane))
sample = manifest.get_sample(
library_index, run_info.flowcell, lane)
for barcode_ubam in sample.barcode_ubams:
print(barcode_ubam)
[0,HYK23DRXY,1]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1001/HYK23DRXY/L001/P25255_1001.TAAGGCGA.unmapped.bam
[0,HYK23DRXY,2]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1001/HYK23DRXY/L002/P25255_1001.TAAGGCGA.unmapped.bam
[1,HYK23DRXY,1]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1002/HYK23DRXY/L001/P25255_1002.CGTACTAG.unmapped.bam
[1,HYK23DRXY,2]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1002/HYK23DRXY/L002/P25255_1002.CGTACTAG.unmapped.bam
[2,HYK23DRXY,1]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1003/HYK23DRXY/L001/P25255_1003.AGGCAGAA.unmapped.bam
[2,HYK23DRXY,2]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1003/HYK23DRXY/L002/P25255_1003.AGGCAGAA.unmapped.bam
[3,HYK23DRXY,1]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1004/HYK23DRXY/L001/P25255_1004.TCCTGAGC.unmapped.bam
[3,HYK23DRXY,2]
/home/hsarkar/code/slide-seq-data/P25255/processed_data/libraries/2022-28-03_P25255_1004/HYK23DRXY/L002/P25255_1004.TCCTGAGC.unmapped.bam
After demultiplexing it multiple unmapped bamfiles will be created for each barcode within lane. If there is one sample_barcode
for each lane then it will only create one file.
Step 3 Merge the barcode.unmapped ubams and
[ ]:
n_libraries = len(list(manifest.libraries))
for library_index in range(n_libraries):
for run_info in run_info_list:
for lane in run_info.lanes:
print('[{},{},{}]'.format(
library_index,run_info.flowcell, lane))
sample = manifest.get_sample(
library_index, run_info.flowcell, lane)
for barcode_ubam in sample.barcode_ubams:
print(barcode_ubam)
bead_structure = sample.get_bead_structure()
xc_range = ":".join(f"{i}-{j}" for c, i, j in bead_structure if c == "C")
xm_range = ":".join(f"{i}-{j}" for c, i, j in bead_structure if c == "M")
if not sample.raw_ubam.exists():
if len(sample.barcode_ubams) > 1:
cmd = config.picard_cmd("MergeSamFiles", manifest.tmp_dir)
cmd.extend(
[
"--OUTPUT",
sample.raw_ubam,
"--SORT_ORDER",
"unsorted",
"--ASSUME_SORTED",
"true",
]
)
for ubam_file in sample.barcode_ubams:
cmd.extend(["--INPUT", ubam_file])
run_command(cmd, "MergeBamFiles", sample)
else:
if sample.raw_ubam.exists():
print(sample.raw_ubam)
else:
os.rename(sample.barcode_ubams[0], sample.raw_ubam)
Step 4 recover whitelisted barcodes (Takes long run from command prompt)
Step 4a: Extract the barcodes (42bp) from the bam files
[17]:
fp = open('../../../scripts/write_shell.txt','w')
for library_index in range(n_libraries):
for run_info in run_info_list:
for lane in run_info.lanes:
print('[{},{},{}]'.format(
library_index,run_info.flowcell, lane))
sample = manifest.get_sample(
library_index, run_info.flowcell, lane)
if sample is None:
print('sample not present')
awk_command = "awk 'NR%2 {{print > \"{}\"}}'".format(str(sample.raw_barcode))
cmd = [
"samtools",
"view",
str(sample.raw_ubam),
"|",
"cut -f10 |",
awk_command
]
fp.write("{}\n".format(' '.join(cmd)))
fp.close()
[0,HYK23DRXY,1]
[0,HYK23DRXY,2]
[1,HYK23DRXY,1]
[1,HYK23DRXY,2]
[2,HYK23DRXY,1]
[2,HYK23DRXY,2]
[3,HYK23DRXY,1]
[3,HYK23DRXY,2]
Run the commands
parallel -j 8 < /home/hsarkar/code/slideseq-tools/write_shell.txt
Step 4b: Create whitelist (Ideally one should borrow information from image analysis)
All the follwoing commands should be run from command line and not notebook
The barcodes will be written using the following script
python scripts/createwhitelist.py
Step 5: Create the refined bam files
python scripts/recoverbam.py
Step 6: Run the alignment step
python scripts/alignment.py
Step 7: Run processing pipeline
python scripts/processing.py
[18]:
!pwd
/home/hsarkar/code/slide-seq-pipeline/docs/source/notebooks