Skip to content

Instantly share code, notes, and snippets.

@random82
Last active January 31, 2018 22:58
Show Gist options
  • Save random82/b9233701d78b046d7c24a603854bc3a4 to your computer and use it in GitHub Desktop.
Save random82/b9233701d78b046d7c24a603854bc3a4 to your computer and use it in GitHub Desktop.
U-SQL UDO Zip Extractor
using Microsoft.Analytics.Interfaces;
using System.Collections.Generic;
using System.IO;
namespace DocExtraction.Udo
{
/// <summary>
/// U-SQL user-defined extractor that emits one row per entry of a zip archive.
/// </summary>
/// <remarks>
/// Rows are populated by ordinal: column 0 is the entry name (string), column 1 is the
/// entry content (string), column 2 is the text-mode flag (bool). Presumably the EXTRACT
/// schema in the calling script must declare its columns in that order - confirm against
/// the script. The attribute sets <c>AtomicFileProcessing = true</c>; per the U-SQL
/// documentation this keeps the input file from being split across extractor instances.
/// </remarks>
[SqlUserDefinedExtractor(AtomicFileProcessing = true)]
public class ZipExtractor : IExtractor
{
    private readonly bool _textMode;

    /// <summary>Creates the extractor.</summary>
    /// <param name="textMode">
    /// Passed through to <see cref="ZipExtractorImpl"/>; selects text decoding instead of
    /// Base64 for the entry content.
    /// </param>
    public ZipExtractor(bool textMode = false)
    {
        _textMode = textMode;
    }

    /// <summary>
    /// Buffers the whole input into memory, then yields one read-only row per result
    /// produced by <see cref="ZipExtractorImpl.Extract"/>.
    /// </summary>
    /// <param name="input">Reader whose <c>BaseStream</c> supplies the raw archive bytes.</param>
    /// <param name="output">Row that is overwritten and re-emitted for every entry.</param>
    /// <returns>A lazily evaluated sequence of rows.</returns>
    public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
    {
        using (var archiveBytes = new MemoryStream())
        {
            // Copy the input into a MemoryStream before handing it to the zip reader.
            input.BaseStream.CopyTo(archiveBytes);

            var impl = new ZipExtractorImpl(_textMode);
            foreach (var item in impl.Extract(archiveBytes))
            {
                yield return ToRow(output, item);
            }
        }
    }

    /// <summary>Writes one extraction result into the three output columns.</summary>
    private static IRow ToRow(IUpdatableRow output, ZipExtractorResult item)
    {
        output.Set(0, item.FileName);
        output.Set(1, item.Content);
        output.Set(2, item.Text);
        return output.AsReadOnly();
    }
}
}
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
namespace DocExtraction.Udo
{
/// <summary>
/// Reads a zip archive from a stream and returns the content of each entry as a string:
/// decoded text when text mode is on, a Base64 encoding of the raw bytes otherwise.
/// </summary>
public class ZipExtractorImpl
{
    // Only assigned in the constructor, so it is readonly.
    private readonly bool _textMode;

    /// <summary>Creates the extractor implementation.</summary>
    /// <param name="textMode">
    /// <c>true</c> to decode entry content as text; <c>false</c> (default) to Base64-encode it.
    /// </param>
    public ZipExtractorImpl(bool textMode = false)
    {
        _textMode = textMode;
    }

    /// <summary>
    /// Lazily enumerates the archive and yields one result per entry whose compressed
    /// length is non-zero.
    /// </summary>
    /// <param name="stream">Stream containing the zip archive.</param>
    /// <returns>
    /// One <see cref="ZipExtractorResult"/> per non-skipped entry, carrying the entry's
    /// full name, its content, and the text-mode flag.
    /// </returns>
    /// <remarks>
    /// This is an iterator: the archive is not opened (and no exception for a bad or null
    /// stream is raised) until the sequence is enumerated. The <see cref="ZipArchive"/> is
    /// created without <c>leaveOpen</c>, so per the .NET documentation the supplied stream
    /// is closed when the archive is disposed at the end of enumeration.
    /// </remarks>
    public IEnumerable<ZipExtractorResult> Extract(Stream stream)
    {
        using (var package = new ZipArchive(stream, ZipArchiveMode.Read))
        {
            foreach (var entry in package.Entries)
            {
                // Entries with no compressed payload are skipped
                // (presumably directory entries and empty stored files).
                if (entry.CompressedLength == 0)
                {
                    continue;
                }

                using (var entryStream = entry.Open())
                {
                    yield return new ZipExtractorResult
                    {
                        FileName = entry.FullName,
                        Content = _textMode ? ReadText(entryStream) : ReadBase64(entryStream),
                        Text = _textMode
                    };
                }
            }
        }
    }

    /// <summary>
    /// Decodes the whole stream as text using <see cref="StreamReader"/>'s default
    /// encoding handling.
    /// </summary>
    private static string ReadText(Stream input)
    {
        // Read straight from the entry stream; an intermediate in-memory copy adds nothing.
        // Disposing the reader also disposes the entry stream; the caller's using block
        // then disposes it a second time, which is a no-op.
        using (var reader = new StreamReader(input))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>Reads the whole stream and returns its bytes Base64-encoded.</summary>
    private static string ReadBase64(Stream input)
    {
        using (var buffer = new MemoryStream())
        {
            // Stream.CopyTo replaces the previous hand-written read/write loop.
            input.CopyTo(buffer);
            return Convert.ToBase64String(buffer.ToArray());
        }
    }
}
}
namespace DocExtraction.Udo
{
/// <summary>
/// One extracted zip entry as produced by <c>ZipExtractorImpl.Extract</c>.
/// </summary>
public class ZipExtractorResult
{
/// <summary>Name of the entry; the extractor fills it from the entry's <c>FullName</c>.</summary>
public string FileName { get; set; }
/// <summary>
/// Entry content as a string: decoded text when <see cref="Text"/> is <c>true</c>,
/// otherwise the Base64 encoding of the entry's bytes.
/// </summary>
public string Content { get; set; }
/// <summary>Text-mode flag the extractor was created with; tells how to interpret <see cref="Content"/>.</summary>
public bool Text { get; set; }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment