Skip to content

Instantly share code, notes, and snippets.

@agdiaz
Last active February 6, 2018 14:34
Show Gist options
  • Save agdiaz/3871397010caec0d4b2a89538b392186 to your computer and use it in GitHub Desktop.
Save agdiaz/3871397010caec0d4b2a89538b392186 to your computer and use it in GitHub Desktop.
Converts a DNA sequence into a binary file
// Compile: gcc -o dna2bit dna_bit.c
// Libraries
#include <stdio.h>
#include <string.h>
// Declare constants
// 2-bit codes assigned to the four nucleotide letters; the trailing comment
// on each line shows the same code written in binary.
#define A 0 // binary 00
#define C 1 // binary 01
#define G 2 // binary 10
#define T 3 // binary 11
// Passed to fwrite in main each time a full group of four bases is written
// (four 2-bit codes fill one byte).
#define BYTES_PER_BATCH 1
// Declare headers and structs
// A single nucleotide stored as a 2-bit code (one of the A/C/G/T constants).
struct Nucleotid {
    unsigned int base : 2;
};
// Translates a base letter into its 2-bit Nucleotid code (defined below main).
struct Nucleotid translate(char c);
// Writes the low 8 bits of `bits` to `out` as one byte.
// The value is copied into an unsigned char first so the output does not
// depend on host byte order: writing the first byte of an unsigned int yields
// the packed value only on little-endian machines.
// Returns 1 on success, 0 if the byte could not be written.
static int write_packed_byte(FILE *out, unsigned int bits) {
    unsigned char byte = (unsigned char)(bits & 0xFFu);
    return fwrite(&byte, sizeof byte, BYTES_PER_BATCH, out) == BYTES_PER_BATCH;
}

// Main function
// Packs a DNA sequence into a binary file, four bases per byte (2 bits each,
// first base in the most significant bits).
// Arguments: the path of the output file, then the sequence, e.g.:
//   ./dna2bit path/to/file.dna GATATATACAGGTACA
// If the sequence length is not a multiple of four, the last byte is padded
// with zero bits on the right. Those padding bits are indistinguishable from
// 'A' bases, so a reader needs the sequence length to decode the last byte.
// Returns 0 on success, 1 on a usage or I/O error.
int main(int argc, char** params) {
    enum { BASES_PER_BYTE = 4, BITS_PER_BASE = 2 };

    // Both arguments are required; without this check params[1]/params[2]
    // may be NULL and the calls below would be undefined behaviour.
    if (argc < 3) {
        fprintf(stderr, "Usage: %s path/to/file.dna SEQUENCE\n",
                argc > 0 ? params[0] : "dna2bit");
        return 1;
    }

    char* pathToFile = params[1];
    char* sequence = params[2];
    // Computed once instead of on every loop iteration.
    size_t sequenceLength = strlen(sequence);

    printf("Welcome to DNA2Bit version 0.0.1\n");
    printf("Input sequence: %s \n", sequence);

    FILE *write_ptr = fopen(pathToFile, "wb");
    if (write_ptr == NULL) {
        perror(pathToFile);
        return 1;
    }

    // Buffer state: the bits accumulated so far and how many bases they hold.
    unsigned int buffer = 0;
    unsigned int basesInBufferCounter = 0;
    unsigned int bytesWritten = 0;
    int writeFailed = 0;

    // Iterate the sequence of chars
    for (size_t baseCounter = 0; baseCounter < sequenceLength; ++baseCounter) {
        // Translate the char to its 2-bit code and append it to the buffer
        struct Nucleotid nucleotid = translate(sequence[baseCounter]);
        buffer = (buffer << BITS_PER_BASE) | nucleotid.base;
        basesInBufferCounter++;

        if (basesInBufferCounter == BASES_PER_BYTE) {
            // Four bases collected: emit them as one byte
            if (!write_packed_byte(write_ptr, buffer)) {
                writeFailed = 1;
                break;
            }
            basesInBufferCounter = 0;
            buffer = 0;
            bytesWritten++;
        }
    }

    // Flush a trailing group of 1-3 bases, which would otherwise be dropped.
    // Shift left so the bases sit in the high bits, as in a full byte.
    if (!writeFailed && basesInBufferCounter > 0) {
        buffer <<= BITS_PER_BASE * (BASES_PER_BYTE - basesInBufferCounter);
        if (write_packed_byte(write_ptr, buffer)) {
            bytesWritten++;
        } else {
            writeFailed = 1;
        }
    }

    // fclose flushes buffered data, so its result matters for a write stream
    if (fclose(write_ptr) != 0) {
        writeFailed = 1;
    }
    if (writeFailed) {
        fprintf(stderr, "Error writing to %s\n", pathToFile);
        return 1;
    }

    // Print statistics
    printf("Total nucleotids written: %zu\n", sequenceLength);
    printf("Total bytes written: %u\n", bytesWritten);
    printf("File written in: %s\n", pathToFile);
    return 0;
}
struct Nucleotid translate(char base) {
struct Nucleotid nucleotid;
if (base == 'A' || base == 'a') {
nucleotid.base = A;
} else if (base == 'C' || base == 'c') {
nucleotid.base = C;
} else if (base == 'G' || base == 'g') {
nucleotid.base = G;
} else {
nucleotid.base = T;
}
return nucleotid;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment